Example #1
def _run(sc,
         map_fun,
         run_id,
         local_logdir=False,
         name="no-name",
         evaluator=False):
    """

    Args:
        sc:
        map_fun:
        local_logdir:
        name:

    Returns:

    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | CollectiveAllReduceStrategy - Distributed Training".format(name))

    server = allreduce_reservation.Server(num_executions)
    server_addr = server.start()

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr,
                      evaluator, util.num_executors()))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if pydoop.hdfs.path.exists(path_to_return):
        with pydoop.hdfs.open(path_to_return, "r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
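_prepare_func is not shown in this example; conceptually it builds the closure that every partition executes. A hypothetical sketch of its shape follows (the real implementation lives in the hops library and differs in detail):

# Hypothetical sketch only -- not the hops implementation.
def _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr,
                  evaluator, num_executors):
    def _wrapper_fun(_partition):
        # Register this executor with the reservation server at server_addr,
        # wait for the full cluster spec to be assembled, export TF_CONFIG for
        # this executor's role, then run the user's training function.
        map_fun()
    return _wrapper_fun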
Example #2
  def __init__(self, count):
    """

    Args:
        count:
    """
    assert count > 0
    self.reservations = Reservations(count)
    self.worker_finished = WorkerFinished(util.num_executors() - util.num_param_servers())
def _launch(sc, map_fun, local_logdir=False, name="no-name"):
    """

    Args:
        sc:
        map_fun:
        local_logdir:
        name:

    Returns:

    """
    global run_id
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup("ParameterServerStrategy",
                   "{} | Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, local_logdir, server_addr,
                      num_ps))

    logdir = _get_logdir(app_id)

    path_to_metric = logdir + '/metric'
    if pydoop.hdfs.path.exists(path_to_metric):
        with pydoop.hdfs.open(path_to_metric, "r") as fi:
            metric = float(fi.read())
        return metric, logdir

    print('Finished Experiment \n')

    return None, logdir
Example #4
def mirrored(train_fn,
             name='no-name',
             local_logdir=False,
             description=None,
             evaluator=False,
             metric_key=None):
    """
    *Distributed Training*

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the train_fn function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :train_fn: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator
        :metric_key: if the training function returns a dict with multiple values, the key in that dict whose value should be associated with the experiment as its metric

    Returns:
        HDFS path in your project where the experiment is stored, and the return value from the process running as chief

    """

    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    num_workers = util.num_executors()
    if evaluator:
        assert num_workers > 2, "number of workers must be at least 3 if evaluator is set to True"

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        experiment_utils._create_experiment_dir(app_id, run_id)

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description,
            app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(sc,
                                                 train_fn,
                                                 run_id,
                                                 local_logdir=local_logdir,
                                                 name=name,
                                                 evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(experiment_json, metric, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
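For reference, a minimal sketch of a train_fn that could be passed to experiment.mirrored and whose returned dict works with metric_key (the model and data below are placeholders, not part of the hops API):

def mirrored_training():
    # Do all imports inside the function, since it runs on the executors
    import numpy as np
    import tensorflow as tf
    from hops import tensorboard

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        # Placeholder model; replace with your own architecture
        model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        model.compile(optimizer='adam', loss='mse')

    # Placeholder data; replace with your own dataset
    x, y = np.random.rand(64, 8), np.random.rand(64, 1)
    history = model.fit(
        x, y, epochs=1, verbose=0,
        callbacks=[tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir())])

    # Returning a dict lets metric_key select which value to attach to the experiment
    return {'loss': float(history.history['loss'][-1])}

# logdir, result = experiment.mirrored(mirrored_training, name='example', metric_key='loss')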
Example #5
def parameter_server(map_fun,
                     name='no-name',
                     local_logdir=False,
                     description=None,
                     evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored, and the return value from the process running as chief

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps must be less than num_executors (i.e. num_executors == num_ps + num_workers)"
    if evaluator:
        assert num_executors - num_ps > 2, "number of workers must be at least 3 if evaluator is set to True"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'parameter_server', 'DISTRIBUTED_TRAINING', None,
            description, app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = ps_impl._run(sc,
                                           map_fun,
                                           run_id,
                                           local_logdir=local_logdir,
                                           name=name,
                                           evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() -
                                                             start)

        experiment_utils._finalize_experiment(experiment_json, None, app_id,
                                              run_id, 'FINISHED', duration,
                                              logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
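The TF_CONFIG exported by the library follows TensorFlow's standard cluster format. A hypothetical example for the first worker in a chief + 2 workers + 1 ps cluster (host names and ports below are illustrative; the reservation server assigns the real ones):

import json

# Illustrative cluster layout for the first worker; hops fills in the real hosts/ports.
example_tf_config = {
    "cluster": {
        "chief": ["executor0:2222"],
        "worker": ["executor1:2222", "executor2:2222"],
        "ps": ["executor3:2222"],
    },
    "task": {"type": "worker", "index": 0},
}
print(json.dumps(example_tf_config, indent=2))  # this is the shape TF_CONFIG takes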
Example #6
def parameter_server(map_fun, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """
    *Distributed Training*

    Sets up the cluster to run ParameterServerStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user themselves.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    import tensorflow
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...ParameterServerStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.parameter_server(distributed_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using ParameterServerStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :versioned_resources: A list of HDFS paths of resources to version with this experiment
        :description: a longer description for the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps > 0, "number of parameter servers should be greater than 0"
    assert num_ps < num_executors, "num_ps must be less than num_executors (i.e. num_executors == num_ps + num_workers)"

    global running
    if running:
        raise RuntimeError("An experiment is currently running. Please call experiment.end() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        running = True

        sc = util._find_spark().sparkContext
        app_id = str(sc.applicationId)

        ps.run_id = ps.run_id + 1

        versioned_path = util._version_resources(versioned_resources, ps._get_logdir(app_id))

        experiment_json = util._populate_experiment(sc, name, 'experiment', 'parameter_server', ps._get_logdir(app_id), None, versioned_path, description)

        util._version_resources(versioned_resources, ps._get_logdir(app_id))

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        retval, logdir = ps._launch(sc, map_fun, local_logdir=local_logdir, name=name)

        experiment_json = util._finalize_experiment(experiment_json, None, retval)

        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        #cleanup spark jobs
        elastic_id += 1
        running = False
        sc.setJobGroup("", "")

    return logdir
def unit2():
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf

    import argparse
    import os
    import numpy
    import sys
    import tensorflow as tf
    import threading
    from datetime import datetime
    from hops import util
    from hops import hdfs

    from tensorflowonspark import TFCluster

    spark = util._find_spark()
    sc = spark.sparkContext
    num_executors = util.num_executors(spark)
    num_ps = util.num_param_servers(spark)

    parser = argparse.ArgumentParser()
    parser.add_argument("-e",
                        "--epochs",
                        help="number of epochs",
                        type=int,
                        default=0)
    parser.add_argument("-f",
                        "--format",
                        help="example format: (csv|pickle|tfr)",
                        choices=["csv", "pickle", "tfr"],
                        default="csv")
    parser.add_argument(
        "-i",
        "--images",
        help="HDFS path to MNIST images in parallelized format",
        default='/Projects/' + hdfs.project_name() + '/mnist/train/images')
    parser.add_argument(
        "-l",
        "--labels",
        help="HDFS path to MNIST labels in parallelized format",
        default='/Projects/' + hdfs.project_name() + '/mnist/train/labels')
    parser.add_argument("-m",
                        "--model",
                        help="HDFS path to save/load model during train/test",
                        default="mnist_model")
    parser.add_argument(
        "-n",
        "--cluster_size",
        help="number of nodes in the cluster (for Spark Standalone)",
        type=int,
        default=num_executors)
    parser.add_argument("-o",
                        "--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("-r",
                        "--readers",
                        help="number of reader/enqueue threads",
                        type=int,
                        default=1)
    parser.add_argument("-s",
                        "--steps",
                        help="maximum number of steps",
                        type=int,
                        default=1000)
    parser.add_argument("-tb",
                        "--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("-X",
                        "--mode",
                        help="train|inference",
                        default="train")
    parser.add_argument("-c",
                        "--rdma",
                        help="use rdma connection",
                        default=False)
    args = parser.parse_args()
    print("args:", args)

    print("{0} ===== Start".format(datetime.now().isoformat()))

    cluster = TFCluster.run(sc, mnist_fun, args, args.cluster_size, num_ps,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))