Exemplo n.º 1
0
def main():
    """Check that the dataset file exists, then hand ``experiment_fn`` to ``run_on_yarn``.

    Raises:
        Exception: if ``WINE_EQUALITY_FILE`` does not exist on the filesystem
            resolved for it.
    """
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    dataset_present = resolved_fs.exists(WINE_EQUALITY_FILE)
    if not dataset_present:
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # Resources requested per task type; the worker count is HVD_SIZE - 1.
    cluster_spec = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }

    # The winequality module and this script, each keyed by its base file name.
    shipped_files = {
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(
        experiment_fn,
        task_specs=cluster_spec,
        files=shipped_files,
        custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task",
    )
Exemplo n.º 2
0
def main():
    """Check the dataset file, run ``experiment_fn`` once locally, then call ``run_on_yarn``.

    Raises:
        Exception: if ``WINE_EQUALITY_FILE`` does not exist on the filesystem
            resolved for it.
    """
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    dataset_present = resolved_fs.exists(WINE_EQUALITY_FILE)
    if not dataset_present:
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # Call experiment_fn once up front to force the call to
    # model_to_estimator._save_first_checkpoint (l457 of
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py).
    # Otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    # Resources requested per task type.
    cluster_spec = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }

    # The winequality module and this script, each keyed by its base file name.
    shipped_files = {
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(experiment_fn, task_specs=cluster_spec, files=shipped_files)
Exemplo n.º 3
0
    experiment_name = "tf-yarn-tests"
    exp = mlflow.get_experiment_by_name(experiment_name)
    if not exp:
        experiment_id = mlflow.create_experiment(
            experiment_name,
            f"{_get_fs_for_tests()}/user/{USER}/mlflow_artifacts")
    else:
        experiment_id = exp.experiment_id

    run_id = mlflow.start_run(experiment_id=experiment_id).info.run_id

    run_on_yarn(experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="1 GiB", vcores=1),
                    "evaluator": TaskSpec(memory="1 GiB", vcores=1)
                },
                files={
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                })

    mlflow.end_run()

    # check if run has been registered in MLFlow
    run_json = requests.get(
        f"{mlflow.get_tracking_uri()}/api/2.0/mlflow/runs/get",
        params={
            'run_id': run_id
        }).json()

    logger.info(f"created run: {run_json}")
Exemplo n.º 4
0
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )


if __name__ == "__main__":
    # Script entry point: request chief, worker, evaluator and tensorboard
    # tasks and pass experiment_fn to run_on_yarn.
    cluster_spec = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
        # The tensorboard task is given HDFS_DIR as its model directory.
        "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR),
    }

    # The winequality module, keyed by its base file name.
    shipped_files = {os.path.basename(winequality.__file__): winequality.__file__}

    run_on_yarn(
        experiment_fn,
        task_specs=cluster_spec,
        files=shipped_files,
        custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task",
    )
Exemplo n.º 5
0
    estimator = tf.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=HDFS_DIR,
        n_classes=winequality.get_n_classes())
    return Experiment(
        estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=100),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    # Script entry point: fail early when the dataset file is missing, then
    # pass experiment_fn to run_on_yarn.
    resolved_fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    dataset_present = resolved_fs.exists(WINE_EQUALITY_FILE)
    if not dataset_present:
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # Resources requested per task type.
    cluster_spec = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
        # The tensorboard task is given HDFS_DIR as its model directory.
        "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR),
    }

    # The winequality module, keyed by its base file name.
    shipped_files = {os.path.basename(winequality.__file__): winequality.__file__}

    run_on_yarn(experiment_fn, task_specs=cluster_spec, files=shipped_files)
Exemplo n.º 6
0
    loss = tf.compat.v1.losses.mean_squared_error(x, labels)
    train_op = tf.compat.v1.assign_add(tf.compat.v1.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      predictions={"x": x},
                                      eval_metric_ops={})


def experiment_fn() -> Experiment:
    """Build an ``Experiment`` around ``model_fn`` with single-step train and eval specs.

    Returns:
        An ``Experiment`` made of an estimator built from ``model_fn``, a train
        spec with ``max_steps=1`` and an eval spec with ``steps=1``.
    """
    # Imported locally to mitigate
    # https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def input_fn():
        # The same constant tensor is returned under the "x" key of the first
        # element and as the second element of the tuple.
        values = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": values}, values

    return Experiment(
        tf.estimator.Estimator(model_fn=model_fn),
        tf.estimator.TrainSpec(input_fn, max_steps=1),
        tf.estimator.EvalSpec(input_fn, steps=1),
    )


if __name__ == "__main__":
    # Passing an explicit skein.Client is useful when multiple learnings run
    # in parallel and share one single skein JAVA process.
    with skein.Client() as shared_client:
        chief_only_spec = {"chief": TaskSpec(memory="1 GiB", vcores=1)}
        run_on_yarn(
            experiment_fn,
            task_specs=chief_only_spec,
            skein_client=shared_client,
        )