Exemplo n.º 1
0
def main(driver_addresses, host_hash, command):
    """Run `command` via the task service of the first task on a host.

    The secret key is decoded from the environment variable named by
    `secret.HOROVOD_SECRET_KEY` and used for both the driver client and the
    task client.

    Args:
        driver_addresses: Addresses handed to `driver_service.DriverClient`.
        host_hash: Host hash used to look up the task indices on that host.
        command: Command forwarded to the task's `run_command`, together
            with the current `os.environ`.

    Raises:
        KeyError: If the secret key environment variable is not set.
    """
    secret_key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    driver = driver_service.DriverClient(driver_addresses, secret_key)

    # Tasks that share a host hash also share memory, so a single ORTED
    # process on the first of them is sufficient.
    leader_index = driver.task_host_hash_indices(host_hash)[0]
    leader_addresses = driver.all_task_addresses(leader_index)

    leader = task_service.TaskClient(leader_index, leader_addresses,
                                     secret_key)
    leader.run_command(command, os.environ)
Exemplo n.º 2
0
def main(driver_addresses):
    """Fetch the code from the driver, execute it, and register the result.

    The task is selected by the rank read from the `OMPI_COMM_WORLD_RANK`
    environment variable. The secret key is decoded from the environment
    variable named by `secret.HOROVOD_SECRET_KEY`.

    Args:
        driver_addresses: Addresses handed to `driver_service.DriverClient`.

    Raises:
        KeyError: If either required environment variable is missing.
        ValueError: If `OMPI_COMM_WORLD_RANK` is not an integer.
    """
    # Watchdog so this process dies when its parent process terminates.
    watchdog = threading.Thread(target=parent_process_monitor,
                                args=(os.getppid(), ))
    watchdog.daemon = True
    watchdog.start()

    secret_key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    mpi_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])

    driver = driver_service.DriverClient(driver_addresses, secret_key)
    index = driver.task_index_by_rank(mpi_rank)
    addresses = driver.all_task_addresses(index)
    reporter = task_service.TaskClient(index, addresses, secret_key)

    # The driver supplies the callable together with its arguments.
    func, positional, keyword = driver.code()
    outcome = func(*positional, **keyword)
    reporter.register_code_result(outcome)
Exemplo n.º 3
0
def main(driver_addresses, host_hash, command):
    """Run `command` via the task service of the first task on a host.

    A host hash containing ':' is rejected up front, before any environment
    lookup or client creation.

    Args:
        driver_addresses: Addresses handed to `driver_service.DriverClient`.
        host_hash: Host hash used to look up the task indices on that host.
        command: Command forwarded to the task's `run_command`, together
            with the current `os.environ`.

    Raises:
        Exception: If `host_hash` contains ':'.
        KeyError: If the secret key environment variable is not set.
    """
    hash_is_legal = ':' not in host_hash
    if not hash_is_legal:
        message = 'Illegal host hash provided. Are you using Open MPI 4.0.0+?'
        raise Exception(message)

    secret_key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    driver = driver_service.DriverClient(driver_addresses, secret_key)

    # Tasks that share a host hash also share memory, so a single ORTED
    # process on the first of them is sufficient.
    leader_index = driver.task_host_hash_indices(host_hash)[0]
    leader_addresses = driver.all_task_addresses(leader_index)

    leader = task_service.TaskClient(leader_index, leader_addresses,
                                     secret_key)
    leader.run_command(command, os.environ)
Exemplo n.º 4
0
# limitations under the License.
# ==============================================================================

import os
import sys

from horovod.spark.task import task_service
from horovod.spark.util import codec, secret
from horovod.spark.driver import driver_service


def main(driver_addresses, host_hash, command):
    """Launch `command` through the first task registered for `host_hash`.

    Args:
        driver_addresses: Addresses handed to `driver_service.DriverClient`.
        host_hash: Host hash whose task indices are requested from the
            driver.
        command: Command passed to the task client's `run_command` along
            with the current `os.environ`.

    Raises:
        KeyError: If the environment variable named by
            `secret.HOROVOD_SECRET_KEY` is not set.
    """
    env = os.environ
    auth_key = codec.loads_base64(env[secret.HOROVOD_SECRET_KEY])
    driver = driver_service.DriverClient(driver_addresses, auth_key)

    # Only one ORTED process is needed per host hash because tasks with the
    # same host hash have shared memory; it runs on the first task.
    indices_on_host = driver.task_host_hash_indices(host_hash)
    chosen_index = indices_on_host[0]
    chosen_addresses = driver.all_task_addresses(chosen_index)

    runner = task_service.TaskClient(chosen_index, chosen_addresses, auth_key)
    runner.run_command(command, os.environ)


if __name__ == '__main__':
    # Script entry point: argv[1] is the base64-encoded driver addresses,
    # argv[2] the host hash, and everything after that is joined with spaces
    # to form the command.
    if len(sys.argv) >= 4:
        main(
            codec.loads_base64(sys.argv[1]),
            sys.argv[2],
            " ".join(sys.argv[3:]),
        )
    else:
        print('Usage: %s <driver addresses> <host hash> <command...>' %
              sys.argv[0])
        sys.exit(1)
Exemplo n.º 5
0
def parent_process_monitor(initial_ppid):
    """Terminate this process once its parent PID no longer matches.

    Polls `os.getppid()` once per second and never returns: as soon as the
    value differs from `initial_ppid`, the process exits immediately with
    status 1 via `os._exit`.

    Args:
        initial_ppid: Parent PID to compare against on every poll.
    """
    # Keep sleeping for as long as the parent is still the one we started with.
    while os.getppid() == initial_ppid:
        time.sleep(1)
    # The parent PID changed, i.e. the parent process died: terminate.
    os._exit(1)


def main(driver_addresses):
    """Execute the driver-provided code for this rank and report its result.

    The rank comes from the `OMPI_COMM_WORLD_RANK` environment variable and
    the secret key from the variable named by `secret.HOROVOD_SECRET_KEY`.

    Args:
        driver_addresses: Addresses handed to `driver_service.DriverClient`.

    Raises:
        KeyError: If either required environment variable is missing.
        ValueError: If `OMPI_COMM_WORLD_RANK` is not an integer.
    """
    # Background daemon thread that kills this process if the parent
    # process terminates.
    parent_pid = os.getppid()
    monitor = threading.Thread(target=parent_process_monitor,
                               args=(parent_pid, ))
    monitor.daemon = True
    monitor.start()

    env = os.environ
    auth_key = codec.loads_base64(env[secret.HOROVOD_SECRET_KEY])
    world_rank = int(env['OMPI_COMM_WORLD_RANK'])

    driver = driver_service.DriverClient(driver_addresses, auth_key)
    my_index = driver.task_index_by_rank(world_rank)
    my_addresses = driver.all_task_addresses(my_index)
    task = task_service.TaskClient(my_index, my_addresses, auth_key)

    # Run the callable received from the driver, then hand back its result.
    target, target_args, target_kwargs = driver.code()
    value = target(*target_args, **target_kwargs)
    task.register_code_result(value)


if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the base64-encoded
    # driver addresses.
    if len(sys.argv) == 2:
        main(codec.loads_base64(sys.argv[1]))
    else:
        print('Usage: %s <driver addresses>' % sys.argv[0])
        sys.exit(1)