Example #1
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")

    args, _ = parser.parse_known_args()
    if not args.smoke_test and repo.is_dirty():
        raise RepositoryDirtyError(
            repo, "Have you forgotten to commit the changes?")

    corpus_size = 100

    config = {
        # A trick to log the SHA of the git HEAD.
        "SHA": tune.grid_search([sha]),
        "corpus_size": tune.grid_search([corpus_size]),
        "margin": tune.loguniform(0.0001, 0.2),
        "lr": tune.loguniform(0.001, 0.1),
        "batch_size": tune.grid_search([300]),
        "num_epochs": max(1000000 // corpus_size, 1),
        "test_freq": max(10000 // corpus_size, 1),
        "seed": 0,
    }

    analysis = tune.run(
        TrainBigramNN,
        name=experiment_name,
        config=config,
        num_samples=1 if args.smoke_test else 1000,
        # trial_name_creator=trial_str_creator,
        resources_per_trial={
            "cpu": 4,
Example #2
    parser.add_argument(
        "--server-address",
        type=str,
        default=None,
        required=False,
        help="The address of server to connect to if using Ray Client.",
    )
    args, _ = parser.parse_known_args()

    if args.server_address:
        import ray

        ray.init(f"ray://{args.server_address}")

    analysis = tune.run(
        easy_objective,
        name="hyperband_test",
        metric="mean_loss",
        mode="min",
        num_samples=5,
        trial_name_creator=trial_str_creator,
        callbacks=[TestLoggerCallback()],
        stop={"training_iteration": 1 if args.smoke_test else 100},
        config={
            "steps": 100,
            "width": tune.randint(10, 100),
            "height": tune.loguniform(10, 100),
        },
    )
    print("Best hyperparameters: ", analysis.best_config)
Example #3
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 0.0,
        # Whether to LZ4 compress observations
        "compress_observations": True,
        # Callback to run before learning on a multi-agent batch of experiences.
        # "before_learn_on_batch": debug_before_learn_on_batch,
        # If set, this will fix the ratio of replayed from a buffer and learned on
        # timesteps to sampled from an environment and stored in the replay buffer
        # timesteps. Otherwise, the replay will proceed at the native ratio
        # determined by (train_batch_size / rollout_fragment_length).
        "training_intensity": None,

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": loguniform(0.0001, 0.1),
        # Learning rate schedule
        "lr_schedule": None,
        # Adam epsilon hyper parameter
        "adam_epsilon": choice([1e-8, 1e-6, 1e-4, 1e-2]),
        # If not None, clip gradients during optimization at this value
        "grad_clip": None,
        # How many steps of the model to sample before learning starts.
        "learning_starts": 16000,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": choice([4, 8, 16, 32]),
        "batch_mode": "truncate_episodes",

        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
Example #4
def optimize(train_x, train_y, test_x, test_y):
    import sklearn.datasets
    import sklearn.metrics
    import os
    from ray.tune.schedulers import ASHAScheduler
    from sklearn.model_selection import train_test_split
    import xgboost as xgb

    from ray import tune
    from ray.tune.integration.xgboost import TuneReportCheckpointCallback

    def train_breast_cancer(config: dict):
        # A simple training function to be passed into Tune.
        # The train/test split is supplied by the enclosing optimize() call,
        # so we only need to build the XGBoost input matrices here.
        train_set = xgb.DMatrix(train_x, label=train_y)
        test_set = xgb.DMatrix(test_x, label=test_y)
        # Train the classifier, using the Tune callback
        xgb.train(
            config,
            train_set,
            evals=[(test_set, "eval")],
            verbose_eval=False,
            callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])

    if __name__ == "__main__":
        search_space = {
            # You can mix constants with search space objects.
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
            "max_depth": tune.randint(1, 9),
            "min_child_weight": tune.choice([1, 2, 3]),
            "subsample": tune.uniform(0.5, 1.0),
            "eta": tune.loguniform(1e-4, 1e-1)
        }
        # This will enable aggressive early stopping of bad trials.
        scheduler = ASHAScheduler(
            max_t=10,  # 10 training iterations
            grace_period=1,
            reduction_factor=2)

        analysis = tune.run(
            train_breast_cancer,
            metric="eval-logloss",
            mode="min",
            # You can add "gpu": 0.1 to allocate GPUs
            resources_per_trial={"cpu": 1},
            config=search_space,
            num_samples=10,
            scheduler=scheduler)

        # Load the best model checkpoint
        best_bst = xgb.Booster()
        best_bst.load_model(os.path.join(analysis.best_checkpoint,
                                         "model.xgb"))
        accuracy = 1. - analysis.best_result["eval-error"]
        print(f"Best model parameters: {analysis.best_config}")
        print(f"Best model total accuracy: {accuracy:.4f}")

        # You could now do further predictions with
        # best_bst.predict(...)

    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING]: Starting to search for the "
                     "best number of trees by cross-validating different values.")

    # read data
    train = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
    train = transform_data(train)
    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    # Create regularization penalty space
    n_estimators = [40, 50, 60, 80, 100, 120, 140, 160]
    min_samples_leaf = [1, 2, 3, 4, 5]
    min_samples_split = [3, 4, 5, 6, 7, 8, 9, 10]
    max_features = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35]
    model = XGBClassifier()

    # Create hyperparameter options
    hyperparameters = dict(min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf,
                           n_estimators=n_estimators,
                           max_features=max_features)

    # Create grid search using 10-fold cross validation
    clf = GridSearchCV(model, hyperparameters, cv=10, verbose=0)
    best_model = clf.fit(x_train, y_train)

    # View best hyperparameters
    # Best estimators: 60
    # Best samples leaf: 1
    # Best samples split: 3
    # Best features: 5
    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING]: Optimization completed.")
    log(
        INFO,
        'Best estimators: {}'.format(
            best_model.best_estimator_.get_params()['n_estimators']))
    log(
        INFO,
        'Best samples leaf: {}'.format(
            best_model.best_estimator_.get_params()['min_samples_leaf']))
    log(
        INFO,
        'Best samples split: {}'.format(
            best_model.best_estimator_.get_params()['min_samples_split']))
    log(
        INFO,
        'Best features: {}'.format(
            best_model.best_estimator_.get_params()['max_features']))
Example #5
                        name=expname)

    print("Best Parameters:", analysis.best_config)

    analysis.best_result_df.to_csv("best_parameters_exp%s_trials%d.csv" %
                                   (expname, ntrials))
    analysis.results_df.to_csv("all_results_exp%s_trials%d.csv" %
                               (expname, ntrials))
    print("Best 5 results")
    print(analysis.results_df.sort_values(by="mcc", ascending=False).head(5))


# +
default_mpl = {
    "structure": "mpl",
    "learning_rate": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "monitor": tune.choice(["loss", "mcc"]),
    "shared_output_size": tune.randint(2, 256),
    "opt_step_size": tune.randint(1, 20),
    "weight_decay": tune.loguniform(1e-5, 1e-2),
    "dropout_input_layers": tune.uniform(0, 1),
    "dropout_inner_layers": tune.uniform(0, 1),
}

default_lstm = {
    "structure": "lstm",
    "learning_rate": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "bidirectional": tune.choice([True, False]),
    "num_layers": tune.choice([1, 2]),
Example #6
def train_wrapper(config, ray_params):
    train_ray(
        path="/data/classification.parquet",
        num_workers=4,
        num_boost_rounds=100,
        num_files=64,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=2,
                           num_actors=4,
                           cpus_per_actor=1,
                           gpus_per_actor=0)

    analysis = tune.run(tune.with_parameters(train_wrapper,
                                             ray_params=ray_params),
                        config=search_space,
Example #7
def hp_space(trial):
    return dict(learning_rate=tune.loguniform(1e-4, 1e-2))
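The one-liner above looks like a hyperparameter-space callback in the style of Hugging Face Transformers' Trainer.hyperparameter_search with the Ray Tune backend. A minimal usage sketch; the trainer object and the trial count are illustrative assumptions, not part of the original snippet:

def run_search(trainer):
    # `trainer` is assumed to be a transformers.Trainer instance.
    best_run = trainer.hyperparameter_search(
        hp_space=hp_space,      # the search-space callback defined above
        backend="ray",          # run the search with Ray Tune
        n_trials=10,            # number of configurations to sample
        direction="minimize",   # minimize the default objective (eval loss)
    )
    print(best_run.hyperparameters)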
Example #8
    def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
        # So that we have some global checkpointing happening.
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
        shutil.rmtree(
            os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
            ignore_errors=True,
        )
        trainer = XGBoostTrainer(
            label_column="target",
            params={},
            # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
            datasets={"train": gen_dataset_func_eager()},
        )
        # prep_v1 = StandardScaler(["worst radius", "worst area"])
        # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
        param_space = {
            "scaling_config": {
                "num_workers": tune.grid_search([1, 2]),
            },
            # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
            #  is resolved.
            # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
            # "datasets": {
            #     "train": tune.choice(
            #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            #     ),
            # },
            "params": {
                "objective": "binary:logistic",
                "tree_method": "approx",
                "eval_metric": ["logloss", "error"],
                "eta": tune.loguniform(1e-4, 1e-1),
                "subsample": tune.uniform(0.5, 1.0),
                "max_depth": tune.randint(1, 9),
            },
        }

        class FailureInjectionCallback(Callback):
            """Inject failure at the configured iteration number."""

            def __init__(self, num_iters=10):
                self.num_iters = num_iters

            def on_step_end(self, iteration, trials, **kwargs):
                if iteration == self.num_iters:
                    print(f"Failing after {self.num_iters} iters.")
                    raise RuntimeError

        tuner = Tuner(
            trainable=trainer,
            run_config=RunConfig(
                name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
            ),
            param_space=param_space,
            tune_config=TuneConfig(mode="min", metric="train-error"),
        )
        with self.assertRaises(TuneError):
            tuner.fit()

        # Test resume
        restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
        tuner = Tuner.restore(restore_path)
        # A hack before we figure out RunConfig semantics across resumes.
        tuner._local_tuner._run_config.callbacks = None
        results = tuner.fit()
        assert len(results) == 2
Example #9
def cifar10_main(method="BlendSearch",
                 num_samples=10,
                 max_num_epochs=100,
                 gpus_per_trial=1):
    data_dir = os.path.abspath("test/data")
    load_data(data_dir)  # Download data for all trials before starting the run
    if method == "BlendSearch":
        from flaml import tune
    else:
        from ray import tune
    if method in ["BOHB"]:
        config = {
            "l1": tune.randint(2, 8),
            "l2": tune.randint(2, 8),
            "lr": tune.loguniform(1e-4, 1e-1),
            "num_epochs": tune.qloguniform(1, max_num_epochs, q=1),
            "batch_size": tune.randint(1, 4),
        }
    else:
        config = {
            "l1": tune.randint(2, 9),
            "l2": tune.randint(2, 9),
            "lr": tune.loguniform(1e-4, 1e-1),
            "num_epochs": tune.loguniform(1, max_num_epochs),
            "batch_size": tune.randint(1, 5),
        }
    import ray

    time_budget_s = 600
    np.random.seed(7654321)
    start_time = time.time()
    if method == "BlendSearch":
        result = tune.run(
            ray.tune.with_parameters(train_cifar, data_dir=data_dir),
            config=config,
            metric="loss",
            mode="min",
            low_cost_partial_config={"num_epochs": 1},
            max_resource=max_num_epochs,
            min_resource=1,
            scheduler="asha",
            resources_per_trial={
                "cpu": 1,
                "gpu": gpus_per_trial
            },
            local_dir="logs/",
            num_samples=num_samples,
            time_budget_s=time_budget_s,
            use_ray=True,
        )
    else:
        if "ASHA" == method:
            algo = None
        elif "BOHB" == method:
            from ray.tune.schedulers import HyperBandForBOHB
            from ray.tune.suggest.bohb import TuneBOHB

            algo = TuneBOHB()
            scheduler = HyperBandForBOHB(max_t=max_num_epochs)
        elif "Optuna" == method:
            from ray.tune.suggest.optuna import OptunaSearch

            algo = OptunaSearch(seed=10)
        elif "CFO" == method:
            from flaml import CFO

            algo = CFO(low_cost_partial_config={
                "num_epochs": 1,
            })
        elif "Nevergrad" == method:
            from ray.tune.suggest.nevergrad import NevergradSearch
            import nevergrad as ng

            algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne)
        if method != "BOHB":
            from ray.tune.schedulers import ASHAScheduler

            scheduler = ASHAScheduler(max_t=max_num_epochs, grace_period=1)
        result = tune.run(
            tune.with_parameters(train_cifar, data_dir=data_dir),
            resources_per_trial={
                "cpu": 1,
                "gpu": gpus_per_trial
            },
            config=config,
            metric="loss",
            mode="min",
            num_samples=num_samples,
            time_budget_s=time_budget_s,
            scheduler=scheduler,
            search_alg=algo,
        )
    ray.shutdown()

    logger.info(f"method={method}")
    logger.info(f"#trials={len(result.trials)}")
    logger.info(f"time={time.time()-start_time}")
    best_trial = result.get_best_trial("loss", "min", "all")
    logger.info("Best trial config: {}".format(best_trial.config))
    logger.info("Best trial final validation loss: {}".format(
        best_trial.metric_analysis["loss"]["min"]))
    logger.info("Best trial final validation accuracy: {}".format(
        best_trial.metric_analysis["accuracy"]["max"]))

    best_trained_model = Net(2**best_trial.config["l1"],
                             2**best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = _test_accuracy(best_trained_model, device)
    logger.info("Best trial test set accuracy: {}".format(test_acc))
Example #10
    def testTuneSampleAPI(self):
        config = {
            "func": tune.sample_from(lambda spec: spec.config.uniform * 0.01),
            "uniform": tune.uniform(-5, -1),
            "quniform": tune.quniform(3.2, 5.4, 0.2),
            "loguniform": tune.loguniform(1e-4, 1e-2),
            "qloguniform": tune.qloguniform(1e-4, 1e-1, 5e-5),
            "choice": tune.choice([2, 3, 4]),
            "randint": tune.randint(-9, 15),
            "lograndint": tune.lograndint(1, 10),
            "qrandint": tune.qrandint(-21, 12, 3),
            "qlograndint": tune.qlograndint(2, 20, 2),
            "randn": tune.randn(10, 2),
            "qrandn": tune.qrandn(10, 2, 0.2),
        }
        for _, (_, generated) in zip(range(1000),
                                     generate_variants({"config": config})):
            out = generated["config"]

            self.assertAlmostEqual(out["func"], out["uniform"] * 0.01)

            self.assertGreaterEqual(out["uniform"], -5)
            self.assertLess(out["uniform"], -1)

            self.assertGreaterEqual(out["quniform"], 3.2)
            self.assertLessEqual(out["quniform"], 5.4)
            self.assertAlmostEqual(out["quniform"] / 0.2,
                                   round(out["quniform"] / 0.2))

            self.assertGreaterEqual(out["loguniform"], 1e-4)
            self.assertLess(out["loguniform"], 1e-2)

            self.assertGreaterEqual(out["qloguniform"], 1e-4)
            self.assertLessEqual(out["qloguniform"], 1e-1)
            self.assertAlmostEqual(out["qloguniform"] / 5e-5,
                                   round(out["qloguniform"] / 5e-5))

            self.assertIn(out["choice"], [2, 3, 4])

            self.assertGreaterEqual(out["randint"], -9)
            self.assertLess(out["randint"], 15)
            self.assertTrue(isinstance(out["randint"], int))

            self.assertGreaterEqual(out["lograndint"], 1)
            self.assertLess(out["lograndint"], 10)
            self.assertTrue(isinstance(out["lograndint"], int))

            self.assertGreaterEqual(out["qrandint"], -21)
            self.assertLessEqual(out["qrandint"], 12)
            self.assertEqual(out["qrandint"] % 3, 0)
            self.assertTrue(isinstance(out["qrandint"], int))

            self.assertGreaterEqual(out["qlograndint"], 2)
            self.assertLessEqual(out["qlograndint"], 20)
            self.assertEqual(out["qlograndint"] % 2, 0)
            self.assertTrue(isinstance(out["qlograndint"], int))

            # Very improbable
            self.assertGreater(out["randn"], 0)
            self.assertLess(out["randn"], 20)

            self.assertGreater(out["qrandn"], 0)
            self.assertLess(out["qrandn"], 20)
            self.assertAlmostEqual(out["qrandn"] / 0.2,
                                   round(out["qrandn"] / 0.2))
Example #11
    def tune_train(args,
                   model_class,
                   task_info: TaskInfo,
                   build_method=default_build_method,
                   model_kwargs: dict = None,
                   tune_config=None):
        if model_kwargs is None:
            model_kwargs = {}
        this_time = time.strftime("%m-%d_%H:%M:%S", time.localtime())
        experiment_name = f'{task_info.task_name}_{this_time}'

        if tune_config is None:
            config = {
                # 3e-4 for Small, 1e-4 for Base, 5e-5 for Large
                "lr":
                tune.loguniform(args.tune_min_lr, args.tune_max_lr),

                # -1 for disable, 0.8 for Base/Small, 0.9 for Large
                "layerwise_lr_decay_power":
                tune.choice([0.8, 0.9]),

                # lr scheduler
                "lr_scheduler":
                tune.choice([
                    'linear_schedule_with_warmup',
                    'polynomial_decay_schedule_with_warmup'
                ]),
            }
        else:
            config = tune_config
        if torch.cuda.is_available():
            resources_per_trial = {
                "cpu": args.tune_cpus_per_trial,
                "gpu": args.tune_gpus_per_trial
            }
        else:
            resources_per_trial = {"cpu": args.tune_cpus_per_trial}
        print("resources_per_trial", resources_per_trial)

        tune_dir = os.path.abspath('tune_lightning_logs')

        analysis = tune.run(
            tune.with_parameters(
                tune_train_once,
                args=args,
                task_info=task_info,
                model_class=model_class,
                build_method=build_method,
                model_kwargs=model_kwargs,
                resume=args.tune_resume,
                group=experiment_name,
                log_dir=tune_dir,
            ),
            mode="max",
            config=config,
            num_samples=args.tune_num_samples,
            metric=f'tune_{task_info.metric_name}',
            name=experiment_name,
            progress_reporter=CLIReporter(
                parameter_columns=list(config.keys()),
                metric_columns=[
                    "loss", f'tune_{task_info.metric_name}',
                    "training_iteration"
                ]),
            callbacks=[TBXLoggerCallback(),
                       CSVLoggerCallback()],
            resources_per_trial=resources_per_trial,
            scheduler=ASHAScheduler(
                max_t=args.max_epochs + 1,  # for test
                grace_period=args.min_epochs),
            queue_trials=True,
            keep_checkpoints_num=args.tune_keep_checkpoints_num,
            checkpoint_score_attr=f'tune_{task_info.metric_name}',
            local_dir=tune_dir,
        )
        print("Best hyperparameters found were: ", analysis.best_config)
        print("Best checkpoint: ", analysis.best_checkpoint)

        args_vars = vars(args)
        args_vars.update(analysis.best_config)
        model = model_class.load_from_checkpoint(os.path.join(
            analysis.best_checkpoint, "tune.ckpt"),
                                                 hparams=args,
                                                 **model_kwargs)

        pl_loggers = [
            loggers.CSVLogger(save_dir=tune.get_trial_dir(),
                              name="",
                              version="."),
            loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                      name="",
                                      version=".",
                                      default_hp_metric=False),
        ]

        try:
            import wandb
            pl_loggers.append(
                loggers.WandbLogger(save_dir=tune_dir,
                                    project=args.project,
                                    name=tune.get_trial_name(),
                                    id=tune.get_trial_id(),
                                    offline=args.offline,
                                    group=experiment_name))
        except Exception:
            pass

        trainer: Trainer = Trainer.from_argparse_args(args, logger=pl_loggers)
        build_method(model, task_info)
        trainer.test(model)
Example #12
    # Get the empty gpu
    gpu.get_empty_gpu()

# /tmp is not accessible on GABA; use the following dir instead:
ray.init(temp_dir='/tmpscratch/alik/runlogs/ray/')


# Log-uniform sampling helper: returns an int drawn log-uniformly between
# base**low and base**high.
def lognuniform(low=0, high=1, base=np.e):
    return int(np.power(base, np.random.uniform(low, high)))


# random search space definition, the loaders are added since I'm not sure how the trainable is called inside tune. # TODO
space = {
    "lr": tune.loguniform(1e-6, 0.1),
    "momentum": tune.loguniform(0.8, 0.9999),
    "n_latent": tune.choice(list(range(
        100, 10000))),  # tune.sample_from(lambda _:lognuniform(2, 4, 10)),
    "n_fmaps": tune.choice(list(range(4, 16))),
    "validation_loader": validation_loader,
    "train_loader": train_loader
}
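The TODO above wonders how the trainable receives these objects; a common Ray Tune pattern is to hand non-hyperparameter objects to the trainable with tune.with_parameters instead of putting them into the search space. A sketch, valid only if trainable is written to accept the extra keyword arguments:

# analysis = tune.run(
#     tune.with_parameters(trainable,
#                          validation_loader=validation_loader,
#                          train_loader=train_loader),
#     config={k: v for k, v in space.items()
#             if k not in ("validation_loader", "train_loader")},
#     num_samples=100,
#     resources_per_trial={'gpu': 1, 'cpu': 4})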

analysis = tune.run(trainable,
                    config=space,
                    num_samples=100,
                    resources_per_trial={
                        'gpu': 1,
                        'cpu': 4
                    },
Example #13
        use_gpu=False,
        trainer_resources={"CPU": 0},  # so that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
# Execute training.
result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 0.6559339960416158, ...}
# __air_pytorch_train_end__

# __air_pytorch_tuner_start__
from ray import tune

param_space = {"train_loop_config": {"lr": tune.loguniform(0.0001, 0.01)}}
metric = "loss"
# __air_pytorch_tuner_end__

# __air_tune_generic_start__
from ray.tune.tuner import Tuner, TuneConfig
from ray.air.config import RunConfig

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=5, metric=metric, mode="min"),
)
# Execute tuning.
result_grid = tuner.fit()
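A natural follow-up, not part of the original listing, is to pull the best trial out of the returned ResultGrid; a minimal sketch using the metric defined above:

best_result = result_grid.get_best_result(metric=metric, mode="min")
print("Best config:", best_result.config)
print("Best metrics:", best_result.metrics)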
Example #14
    bst = xgb.train(
        config, train_set, evals=[(test_set, "eval")], callbacks=[XGBCallback])
    preds = bst.predict(test_set)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels),
        done=True)


if __name__ == "__main__":
    num_threads = 2
    config = {
        "verbosity": 0,
        "num_threads": num_threads,
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": ["auc", "ams@0", "logloss"],
        "max_depth": tune.randint(1, 9),
        "eta": tune.loguniform(1e-4, 1e-1),
        "gamma": tune.loguniform(1e-8, 1.0),
        "grow_policy": tune.choice(["depthwise", "lossguide"])
    }

    from ray.tune.schedulers import ASHAScheduler
    tune.run(
        train_breast_cancer,
        resources_per_trial={"cpu": num_threads},
        config=config,
        num_samples=2,
        scheduler=ASHAScheduler(metric="eval-logloss", mode="min"))
Example #15
data_paths = {
    'train': './data/train/',
    'val': './data/val/'
}

NUM_CPU_PER_TRIAL = os.cpu_count()
NUM_GPU_PER_TRIAL = 1 
resources_per_trial = {"gpu": NUM_GPU_PER_TRIAL, "cpu": NUM_CPU_PER_TRIAL}

MAX_TRAINING_EPOCH_PER_TRIAL = 15

SCHEDULER_GAMMA = 0.3

param_priority = ['lr','step','momentum','weight_decay','batch_size']

param_space = {
    'lr': tune.loguniform(1e-5,1e-1),
    'momentum': tune.uniform(0.5,0.99),
    'step': tune.choice([1,2,3]),
    'weight_decay': tune.loguniform(1e-8,1e-5),
    'batch_size': tune.choice([2**k for k in range(7,10)])
}

param_defaults = {
    'lr': 1e-2,
    'momentum': 0.9,
    'step': 2,
    'weight_decay': 1e-7,
    'batch_size': 128
}
Example #16
def searchBestHypers(num_samples=10,
                     max_num_epochs=15,
                     n_epochs_stop=2,
                     grace_period=5,
                     gpus_per_trial=0,
                     data_obj=None):
    import os
    os.chdir('/content/drive/My Drive/DL project/')
    assert data_obj is not None

    experiment_id = 'no_name_yet'

    config_schedule = {
        "batch_size": tune.choice([4, 8, 16, 32]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "h1": tune.sample_from(lambda: 2**np.random.randint(3, 8)),
        "h2": tune.sample_from(lambda: 2**np.random.randint(3, 8)),
        "wd": tune.loguniform(1e-4, 1e-1),
    }

    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    pbt = PopulationBasedTraining(time_attr="training_iteration",
                                  metric="loss",
                                  mode="min",
                                  perturbation_interval=4,
                                  hyperparam_mutations={
                                      "batch_size": [8, 16, 32, 64, 128],
                                      "lr": tune.loguniform(1e-4, 1e-1),
                                      "h1": [4, 8, 16, 32, 64],
                                      "h2": [4, 8, 16, 32, 64],
                                      "wd": tune.loguniform(1e-4, 1e-1),
                                  })

    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    result = tune.run(partial(train_cgm,
                              data_obj=data_obj,
                              n_epochs_stop=n_epochs_stop,
                              max_epochs=max_num_epochs,
                              grace_period=grace_period),
                      resources_per_trial={
                          "cpu": 1,
                          "gpu": gpus_per_trial
                      },
                      config=config_schedule,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    # Build best network
    best_trained_model = DilatedNet(h1=best_trial.config["h1"],
                                    h2=best_trial.config["h2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value

    print("BEST MODEL DIR: ", best_checkpoint_dir)
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    # Call load to fit the scaler. There should be a better solution.
    trainset, valset = data_obj.load_train_and_val()

    test_rmse_val = test_rmse(best_trained_model, data_obj)
    print("Best trial test set rmse: {}".format(test_rmse_val))

    # Save the results
    experiment = {
        'name': str(experiment_id),
        'best_trial_dir': str(best_checkpoint_dir),
        'train_data': str(data_obj.train_data),
        'test_data': str(data_obj.test_data),
        'start_date_train': str(data_obj.start_date_train),
        'start_date_test': str(data_obj.start_date_test),
        'end_date_train': str(data_obj.end_date_train),
        'end_date_test': str(data_obj.end_date_test)
    }

    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    user = getpass.getuser()
    experiment_id = f'id_{current_time}_{user}'
    experiment_path = code_path / 'hyper_experiments'  # / model_id
    experiment_path.mkdir(exist_ok=True, parents=True)

    with open(experiment_path / (experiment_id + '.json'), 'w') as outfile:
        json.dump(experiment, outfile, indent=4)
    ''' Optionally print information on where the optimal model is saved '''
    #print("\n Experiment details are saved in:\n", experiment_path / (experiment_id + '.json'))
    #print("\n Checkpoint for best configuration is saved in:\n", best_checkpoint_dir)

    return experiment_id
Example #17
    utils_params.save_config(run_paths['path_gin'], gin.config_str())

    # setup pipeline
    train_ds, valid_ds, test_ds = datasets.load()

    # model
    model = DenseNet121(IMG_SIZE=256)

    trainer = Trainer(model=model, ds_train=train_ds, ds_val=test_ds, run_paths=run_paths)
    for val_accuracy in trainer.train():
        tune.report(val_accuracy=val_accuracy)
# Some questions about the Tune API

analysis = tune.run(
    train_func, num_samples=100, resources_per_trial={'gpu': 1, 'cpu': 2},
    config={
        "Trainer.total_steps": tune.grid_search([5000]),
        "Trainer.total_steps_ft": tune.randint(300, 1500),
        "Trainer.lr": tune.loguniform(1e-3, 1e-2),
        "Trainer.lr_ft": tune.loguniform(1e-6, 1e-4),
        "Trainer.ft_layer_idx": tune.randint(100, 300),
        "DenseNet121.dense_units": tune.randint(2, 64),
        "DenseNet121.dropout_rate": tune.uniform(0, 0.9),
        "DenseNet121.idx_layer": tune.randint(200, 400)
    })

print("Best config: ", analysis.get_best_config(metric="val_accuracy", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
Example #18
def trainable(config):
    # config (dict): A dict of hyperparameters.

    for x in range(20):
        score = objective(x, config["a"], config["b"])

        tune.track.log(score=score)  # This sends the score to Tune.


# class based API example
class Trainable(tune.Trainable):
    def _setup(self, config):
        # config (dict): A dict of hyperparameters
        self.x = 0
        self.a = config["a"]
        self.b = config["b"]

    def _train(self):  # This is called iteratively.
        score = objective(self.x, self.a, self.b)
        self.x += 1
        return {"score": score}


# Run with a sampled log-uniformly and b sampled uniformly from (1e-8, 1)
space = {"a": tune.loguniform(1e-8, 1), "b": tune.uniform(1e-8, 1)}
analysis = tune.run(trainable, config=space, num_samples=100, resources_per_trial={'gpu': 1, 'cpu': 4})

# Use analysis object
df = analysis.dataframe(metric="score", mode="max")
print(df)
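Both trainables above call an objective() helper that is not shown in this snippet; a plausible stand-in in the spirit of the Ray Tune documentation (purely illustrative, not the original definition):

def objective(x, a, b):
    # Toy objective combining the step count x with the hyperparameters a and b.
    return a * (x ** 0.5) + b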
Example #19
#  resources per trial (kp_search_kwargs)

debug_finetuning_hp_search = deepcopy(finetuning_bert100k_glue)
debug_finetuning_hp_search.update(
    model_name_or_path=
    "/mnt/efs/results/pretrained-models/transformers-local/bert_100k",  # noqa: E501
    task_name=None,
    task_names=["cola", "rte"],
    num_runs=1,
    max_steps=200,
    save_steps=1,
    warmup_ratio=0.1,
    hp_validation_dataset_pct=1.0,
    report_to="none",
    task_hyperparams=dict(
        cola=dict(hp_space=lambda trial: dict(learning_rate=tune.loguniform(
            1e-5, 1e-2)),
                  hp_num_trials=3,
                  hp_compute_objective=("maximize",
                                        "eval_matthews_correlation")),
        rte=dict(hp_space=lambda trial: dict(learning_rate=tune.loguniform(
            1e-5, 1e-2)),
                 hp_num_trials=3,
                 hp_compute_objective=("maximize", "eval_accuracy")),
    ),
)

debug_finetuning_sparse_hp_search = deepcopy(
    finetuning_bert_sparse_85_trifecta_100k_glue_get_info)
debug_finetuning_sparse_hp_search.update(
    task_name="cola",
    task_names=None,
Example #20
    gbm = lgb.train(config,
                    train_set,
                    valid_sets=[test_set],
                    verbose_eval=False,
                    callbacks=[LightGBMCallback])
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)
    tune.report(mean_accuracy=sklearn.metrics.accuracy_score(
        test_y, pred_labels),
                done=True)


if __name__ == "__main__":
    config = {
        "objective": "binary",
        "metric": "binary_error",
        "verbose": -1,
        "boosting_type": tune.grid_search(["gbdt", "dart"]),
        "num_leaves": tune.randint(10, 1000),
        "learning_rate": tune.loguniform(1e-8, 1e-1)
    }

    analysis = tune.run(train_breast_cancer,
                        metric="binary_error",
                        mode="min",
                        config=config,
                        num_samples=2,
                        scheduler=ASHAScheduler())

    print("Best hyperparameters found were: ", analysis.best_config)
Example #21
                dataset=dataset))

    # Run the training
    analysis = tune.run(Training,
                        stop={
                            'training_iteration': args.epochs,
                            'stop_early': True
                        },
                        checkpoint_at_end=True,
                        metric="valid_rmse",
                        mode="min",
                        local_dir=args.logdir,
                        verbose=1,
                        config={
                            "learning_rate":
                            tune.loguniform(args.learning_rate_low,
                                            args.learning_rate_high),
                            "l1":
                            tune.loguniform(args.l1_low, args.l1_high),
                        },
                        num_samples=args.num_samples,
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": 0
                        })

    ray.shutdown()

    # Save args
    with open(os.path.join(args.logdir, "args.pickle"), 'wb') as f:
        pickle.dump(vars(args), f)
Example #22
import torch
import mlflow
from ray import tune
from src.scripts.train_wav2vec_kws import Wav2VecKWS
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loggers import MLFlowLogger
from ray.tune.integration.mlflow import mlflow_mixin
from ray.tune.integration.pytorch_lightning import TuneReportCallback

from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.ax import AxSearch

config = {
    "w2v_lr": tune.loguniform(1e-5, 1e-1),
    "decoder_lr": tune.loguniform(1e-4, 1e-1),
    "weight_decay": tune.loguniform(1e-6, 1e-3),
    "batch_size": tune.choice([32, 64, 128, 256]),
    "mlflow": {
        "experiment_name": "wav2vec_kws",
        "tracking_uri": "http://192.168.0.32"
    },
}


@mlflow_mixin
def train_model(config, gpus, w2v, num_epochs=10):
    early_stop_callback = EarlyStopping(monitor="val_Accuracy",
                                        min_delta=0.0,
                                        patience=5,
Example #23
import pytest

from autogluon.core.hpo.space_converter import RaySpaceConverterFactory
from autogluon.core.space import Space, Categorical, Real, Int, Bool
from ray import tune


@pytest.mark.parametrize('space, expected_space', [
    (Categorical([1, 2]), tune.choice([1, 2])),
    (Real(1, 2, log=True), tune.loguniform(1, 2)),
    (Real(1, 2, log=False), tune.uniform(1, 2)),
    (Int(1, 2), tune.randint(1, 3)),
    (Bool(), tune.randint(0, 2)),
])
def test_space_converter(space, expected_space):
    ray_space = RaySpaceConverterFactory.get_space_converter(
        space.__class__.__name__).convert(space)
    assert type(ray_space) == type(expected_space)
Example #24
        agent.epsilon_decay = config["epsilon_decay"]
        agent.eta = config["eta"]
        for i in range(10):
            agent.play()

        agent_test = AgentTest(agent, RandomPlayer(), 0)
        reward = agent_test.play()

    tune.report(steps=agent.steps, reward=reward)


bayesopt = BayesOptSearch(metric="reward", mode="max")
analysis = tune.run(
    training_function,
    config={
        "learning_rate": tune.loguniform(1e-3, 1e-1),
        "alpha": tune.loguniform(1e-3, 1e-1),
        "gamma": tune.loguniform(1e-3, 1e-1),
        "delta": tune.loguniform(0.1, 0.5),
        "epsilon_decay": tune.uniform(0.9999, 0.999999),
        "eta": tune.loguniform(1e-3, 1e-1)
    },
    local_dir="/ray_results/DQN",
    search_alg=bayesopt,
    num_samples=100)

print("Best config: ", analysis.get_best_config(
    metric="reward", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df
Example #25
def main(name=None,
         num_samples=64,
         gpus_per_trial=1,
         metric="sotl",
         time_budget=None,
         batch_size=100,
         steps_per_epoch=100,
         max_num_epochs=150,
         total_budget_multiplier=10,
         seed=None):
    data_dir = os.path.abspath("../playground/data")
    load_data(data_dir)  # Download data for all trials before starting the run
    if seed is None:
        seed = random.randint(0, 1000)

    config = {
        "lr": tune.loguniform(5e-5, 5),
        "conv1_l2": tune.loguniform(5e-5, 5),
        "conv2_l2": tune.loguniform(5e-5, 5),
        "conv3_l2": tune.loguniform(5e-5, 5),
        "fc1_l2": tune.loguniform(5e-3, 500),
        "lr_reductions": tune.choice([0, 1, 2, 3]),
        "rnorm_scale": tune.loguniform(5e-6, 5),
        "rnorm_power": tune.uniform(0.01, 3),
        "max_num_epochs": max_num_epochs,
        "batch_size": batch_size,
        "steps_per_epoch": steps_per_epoch,
        "data_dir": data_dir,
        "seed": seed,
        "metric": metric,
        "time_budget": time_budget,
        "total_budget_multiplier": total_budget_multiplier
    }
    scheduler = ASHAScheduler(max_t=config["max_num_epochs"],
                              grace_period=1,
                              reduction_factor=4)

    result = tune.run(train_cifar,
                      name=name,
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config={
                          **config, "wandb": {
                              "project":
                              "SoTL_Cifar",
                              "api_key_file":
                              "~" + os.sep + ".wandb" + os.sep + "nas_key.txt"
                          }
                      },
                      metric=config["metric"],
                      mode="min",
                      num_samples=num_samples,
                      scheduler=scheduler,
                      stop=TotalBudgetStopper(
                          config["max_num_epochs"] *
                          config["total_budget_multiplier"]),
                      loggers=DEFAULT_LOGGERS + (WandbLogger, ),
                      time_budget_s=config["time_budget"])

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(rnorm_scale=best_trial.config["rnorm_scale"],
                             rnorm_power=best_trial.config["rnorm_power"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

    if os.path.exists("~" + os.sep + ".wandb" + os.sep + "nas_key.txt"):
        f = open("~" + os.sep + ".wandb" + os.sep + "nas_key.txt", "r")
        key = f.read()
        os.environ["WANDB_API_KEY"] = key
Example #26
            },
        },
        "goal": "maximize",
        "num_samples": 4,
    },
}

if RAY_AVAILABLE:
    EXPECTED_SEARCH_SPACE = {
        "test_1": {
            "training.learning_rate": tune.uniform(0.001, 0.1),
            "combiner.num_fc_layers": tune.qrandint(3, 6, 3),
            "utterance.cell_type": tune.grid_search(["rnn", "gru", "lstm"]),
        },
        "test_2": {
            "training.learning_rate": tune.loguniform(0.001, 0.1),
            "combiner.num_fc_layers": tune.randint(2, 6),
            "utterance.cell_type": tune.choice(["rnn", "gru", "lstm"]),
        },
    }


@pytest.mark.skipif(not RAY_AVAILABLE,
                    reason="Ray is not installed for testing")
@pytest.mark.parametrize("key", ["test_1", "test_2"])
def test_grid_strategy(key):

    hyperopt_test_params = HYPEROPT_PARAMS[key]
    expected_search_space = EXPECTED_SEARCH_SPACE[key]

    goal = hyperopt_test_params["goal"]
Example #27
    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])

    trainer.test(model)


config = {
    "attn_dropout": tune.quniform(0, 1, 0.1),
    "attn_dropout_a": tune.quniform(0, 1, 0.1),
    "attn_dropout_v": tune.quniform(0, 1, 0.1),
    "embed_dropout": tune.quniform(0, 1, 0.1),
    "out_dropout": tune.quniform(0, 1, 0.1),
    "relu_dropout": tune.quniform(0, 1, 0.1),
    "res_dropout": tune.quniform(0, 1, 0.1),
    # "project_dim": tune.choice([40, 50, 60, 70]),
    "lr": tune.loguniform(1e-6, 1e-3),
    "weight_decay": tune.loguniform(1e-10, 1e-2),
}

previous_best = {
    "attn_dropout": 0.3,
    "attn_dropout_a": 0.5,
    "attn_dropout_v": 0.0,
    "embed_dropout": 0.0,
    "out_dropout": 0.2,
    "relu_dropout": 0.5,
    "res_dropout": 0.1,
    "layers": 5,
    "num_heads": 6,
    "head_dim": 14,
    "lr_log": -4,
Example #28
https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-pbt
"""

"""config - returns a dict of hyperparameters

Selecting different hyperparameters for tuning
    l1 : Number of units in first fully connected layer
    l2 : Number of units in second fully connected layer
    lr : Learning rate
    decay : Decay rate for regularization
    batch_size : Batch size of test and train data
"""
config = {
    "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # eg. 4, 8, 16 .. 512
    "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # eg. 4, 8, 16 .. 512
    "lr": tune.loguniform(1e-4, 1e-1), # Sampling from log uniform distribution
    "decay": tune.sample_from(lambda _: 10 ** np.random.randint(-7, -3)), # eg. 1e-7, 1e-6, .. 1e-3
    "batch_size": tune.choice([32, 64, 128, 256])
}

# calling trainer
trainer = Trainer(device=device)

"""ASHA (Asynchronous Successive Halving Algorithm) scheduler
        max_t              : Maximum number of units per trail (can be time or epochs)
        grace_period       : Stop trials after specific number of unit if model is not performing well (can be time or epochs)
        reduction_factor   : Set halving rate
"""
scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=4,
Example #29
if __name__ == '__main__':
    # ===============================================================================
    # Start Process
    # ===============================================================================

    train_config = {
        'data_dir': '/home/congvm/Workspace/evoke/thirdparty/tune/data',
        'num_epochs': 40,
        'num_gpus': 1
    }

    tuning_config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
        "opt": tune.choice(['adam', 'sgd'])
    }

    log_dir = 'experiments'
    experiment_name = 'tune_mnist_asha_' + generate_datetime()

    metric_columns = ["val_loss", "val_accuracy", "training_iteration"]

    start_tuning(tuning_config=tuning_config,
                 train_config=train_config,
                 training_func=train_mnist_tune,
                 report_metric_columns=metric_columns,
                 monitor_metric='val_loss',
                 monitor_mode='min',
Example #30
    "petal_length": [
        1.4, 4.7, 6, 1.4, 4.7, 6, 1.4, 4.7, 6, 1.4, 4.7, 6],
    "petal_width":  [
        0.2, 1.4, 2.5, 0.2, 1.4, 2.5, 0.2, 1.4, 2.5, 0.2, 1.4, 2.5],
    "variety": tf.keras.utils.to_categorical([
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2])
}

# test_hyperparameter_space = {
#     "lr": tune.sample_from([0.05, 0.01]),
#     "dense_1": tune.sample_from([1, 2]),
#     "dense_2": tune.sample_from([1, 2]),
#     "epochs": tune.sample_from([2, 3]),
#     "batch_size": tune.sample_from([5, 6])
# }

test_hyperparameter_space = {
    "lr": tune.loguniform(0.001, 0.1),
    "dense_1": tune.uniform(2, 128),
    "dense_2": tune.uniform(2, 128),
    "epochs": tune.uniform(1, 5),
    # tune.sample_from expects a callable; tune.choice picks from a list.
    "batch_size": tune.choice([16, 32])
}
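Note that dense_1, dense_2, and epochs above are sampled as floats; if the training code needs integers, tune.randint (whose upper bound is exclusive) or an explicit cast inside the trainable is the usual substitute. A sketch of an integer-valued variant (illustrative, not part of the original test spaces):

int_hyperparameter_space = {
    "lr": tune.loguniform(0.001, 0.1),
    "dense_1": tune.randint(2, 129),   # integers in [2, 128]
    "dense_2": tune.randint(2, 129),
    "epochs": tune.randint(1, 6),      # integers in [1, 5]
    "batch_size": tune.choice([16, 32])
}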
test_simple_train_hyperparameters = {
    "lr": 0.05,
    "dense_1": 1,
    "dense_2": 1,
    "batch_size": 10,
    "epochs": 1
}