Example #1
# Import paths below are assumed from the surrounding test module (Ray Train 1.x
# API); they are not shown in the original excerpt.
from ray import train
from ray.train import Trainer
from ray.train.callbacks import MLflowLoggerCallback
from ray.tune.result import TRAINING_ITERATION  # the "training_iteration" metric key


def test_mlflow(ray_start_4_cpus, tmp_path):
    # `TestConfig` and the `ray_start_4_cpus` fixture are Ray test utilities
    # defined elsewhere in the test suite.
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp",
                                    logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(run_id=run_id,
                                               key="episode_reward_mean")

    assert len(metric_history) == 3
    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]
    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
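
Outside a test you would not normally reach into the callback's private ``mlflow_util`` handle; the same runs and metric history can be read back through MLflow's public API. A minimal sketch, assuming a file-based tracking location (the path below is a hypothetical placeholder for whatever ``logdir``/tracking URI you configured):

import mlflow
from mlflow.tracking import MlflowClient

# Hypothetical tracking location; use the logdir / tracking_uri you gave the callback.
mlflow.set_tracking_uri("file:/tmp/mlflow_logs")

experiment = mlflow.get_experiment_by_name("test_exp")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])  # pandas DataFrame
run_id = runs.loc[0, "run_id"]

client = MlflowClient()
# Full per-step history of a metric reported via train.report().
history = client.get_metric_history(run_id, "episode_reward_mean")
print([(m.step, m.value) for m in history])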
Example #2

def main(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    final_results = trainer.run(
        train_func=train_func,
        config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
        ])

    print("Full losses for rank 0 worker: ", final_results)
Example #3
        "num_layers": NUM_LAYERS,
        "dropout_every": DROPOUT_EVERY,
        "dropout_prob": DROPOUT_PROB,
        "num_features": num_features,
    }

    # Create 2 callbacks: one for Tensorboard Logging and one for MLflow
    # logging. Pass these into Trainer, and all results that are
    # reported by ``train.report()`` will be logged to these 2 places.
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    tbx_logdir = "./runs"
    os.makedirs(tbx_logdir, exist_ok=True)
    callbacks = [
        TBXLoggerCallback(logdir=tbx_logdir),
        MLflowLoggerCallback(experiment_name="cuj-big-data-training",
                             save_artifact=True),
    ]

    # Remove CPU resource so Datasets can be scheduled.
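    # With {"CPU": 0, "GPU": 1} each training worker reserves a GPU but no CPU
    # slots, so the cluster's CPUs stay available for Ray Datasets
    # ingest/preprocessing tasks that run alongside training.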
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(
        backend="torch",
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,