Пример #1
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file) and handed to the
    ``submit`` function of the matching ``dmlc_tracker`` backend.

    Raises:
        RuntimeError: if the launcher is unknown, if ``ssh``/``mpi`` is
            selected without a hostfile, or if ``yarn``/``local``/``sge`` is
            combined with a hostfile.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n',
                        '--num-workers',
                        required=True,
                        type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s',
                        '--num-servers',
                        type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H',
                        '--hostfile',
                        type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir',
                        type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher',
                        type=str,
                        default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('command',
                        nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown
    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher in ('ssh', 'mpi'):
        # Report the missing hostfile explicitly instead of claiming that the
        # (perfectly valid) launcher name is unknown.
        if not has_hostfile:
            raise RuntimeError(
                'The %s launcher requires a hostfile; pass one with '
                '-H/--hostfile' % launcher)
        if launcher == 'ssh':
            from dmlc_tracker import ssh as backend
        else:
            from dmlc_tracker import mpi as backend
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)

    backend.submit(args)
Пример #2
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file) and handed to the
    ``submit`` function of the matching ``dmlc_tracker`` backend.

    Raises:
        RuntimeError: if the launcher is unknown, if ``ssh``/``mpi`` is
            selected without a hostfile, or if ``yarn``/``local``/``sge`` is
            combined with a hostfile.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown
    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher in ('ssh', 'mpi'):
        # A valid launcher without its hostfile is a usage error, not an
        # unknown cluster type; say so.
        if not has_hostfile:
            raise RuntimeError(
                'The %s launcher requires a hostfile; pass one with '
                '-H/--hostfile' % launcher)
        if launcher == 'ssh':
            from dmlc_tracker import ssh as backend
        else:
            from dmlc_tracker import mpi as backend
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)

    backend.submit(args)
Пример #3
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. When ``--byteps`` is given, the raw
    parsed options go straight to ``byteps_launcher.submit`` and nothing else
    runs. Otherwise ``--num-servers`` falls back to ``--num-workers``,
    ``--p3`` prepends its environment assignments to the command, the options
    are converted with ``dmlc_opts`` (defined elsewhere in this file) and the
    matching ``dmlc_tracker`` backend's ``submit`` is called.

    Raises:
        RuntimeError: if the launcher is unknown, if ``ssh``/``mpi`` is
            selected without a hostfile, or if ``yarn``/``local``/``sge`` is
            combined with a hostfile.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n',
                        '--num-workers',
                        required=True,
                        type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s',
                        '--num-servers',
                        type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H',
                        '--hostfile',
                        type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher.\
                        When -SH is set, the file provided by -H will \
                        be used to recognize worker machines only. Otherwise, \
                        -H is used for both server and worker machines.')
    parser.add_argument('-SH',
                        '--server-hostfile',
                        type=str,
                        help='the hostfile of server machines which will run \
                        the job. Required for byteps multi-machine launching.')
    parser.add_argument('--sync-dst-dir',
                        type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher',
                        type=str,
                        default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    bps_group = parser.add_argument_group('byteps-backend')
    bps_group.add_argument('--byteps',
                           action='store_true',
                           help='Whether use byteps launcher to launch')
    parser.add_argument(
        '--env-server',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the server processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument(
        '--env-worker',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the worker processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env',
                        action='append',
                        default=[],
                        help='given a environment variable, passes their \
                        values from current system to all workers and servers. \
                        Not necessary when launcher is local as in that case \
                        all environment variables which are set are copied.')
    parser.add_argument('--p3',
                        action='store_true',
                        default=False,
                        help='Use P3 distributed training')
    parser.add_argument('command',
                        nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown

    if args.byteps:
        # The byteps launcher receives the untouched argparse namespace and
        # bypasses the dmlc_tracker dispatch below entirely.
        import byteps_launcher as bpsl
        bpsl.submit(args)
        return

    if args.num_servers is None:
        args.num_servers = args.num_workers
    if args.p3:
        # Prefix the command with the P3 environment assignments.
        args.command = ['DMLC_PS_VAN_TYPE=p3 DMLC_PS_WATER_MARK=10'
                        ] + args.command

    args = dmlc_opts(args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher in ('ssh', 'mpi'):
        # A valid launcher without its hostfile is a usage error, not an
        # unknown cluster type; say so.
        if not has_hostfile:
            raise RuntimeError(
                'The %s launcher requires a hostfile; pass one with '
                '-H/--hostfile' % launcher)
        if launcher == 'ssh':
            from dmlc_tracker import ssh as backend
        else:
            from dmlc_tracker import mpi as backend
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)

    backend.submit(args)
Пример #4
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file) and handed to the
    ``submit`` function of the matching ``dmlc_tracker`` backend.

    Raises:
        RuntimeError: if the launcher is unknown, if ``ssh``/``mpi`` is
            selected without a hostfile, or if ``yarn``/``local``/``sge`` is
            combined with a hostfile.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the server processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the worker processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their \
                        values from current system to all workers and servers. \
                        Not necessary when launcher is local as in that case \
                        all environment variables which are set are copied.')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown
    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher in ('ssh', 'mpi'):
        # A valid launcher without its hostfile is a usage error, not an
        # unknown cluster type; say so.
        if not has_hostfile:
            raise RuntimeError(
                'The %s launcher requires a hostfile; pass one with '
                '-H/--hostfile' % launcher)
        if launcher == 'ssh':
            from dmlc_tracker import ssh as backend
        else:
            from dmlc_tracker import mpi as backend
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)

    backend.submit(args)
Пример #5
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file) and handed to the
    ``submit`` function of the matching ``dmlc_tracker`` backend.

    Raises:
        RuntimeError: if the launcher is unknown, if ``ssh``/``mpi`` is
            selected without a hostfile, or if ``yarn``/``local``/``sge`` is
            combined with a hostfile.
    """
    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n',
                        '--num-workers',
                        required=True,
                        type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s',
                        '--num-servers',
                        type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H',
                        '--hostfile',
                        type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir',
                        type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher',
                        type=str,
                        default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument(
        '--env-server',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the server processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument(
        '--env-worker',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the worker processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env',
                        action='append',
                        default=[],
                        help='given a environment variable, passes their \
                        values from current system to all workers and servers. \
                        Not necessary when launcher is local as in that case \
                        all environment variables which are set are copied.')
    parser.add_argument('command',
                        nargs='+',
                        help='command for launching the program')
    args, unknown = parser.parse_known_args()
    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown
    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher in ('ssh', 'mpi'):
        # A valid launcher without its hostfile is a usage error, not an
        # unknown cluster type; say so.
        if not has_hostfile:
            raise RuntimeError(
                'The %s launcher requires a hostfile; pass one with '
                '-H/--hostfile' % launcher)
        if launcher == 'ssh':
            from dmlc_tracker import ssh as backend
        else:
            from dmlc_tracker import mpi as backend
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)

    backend.submit(args)
Пример #6
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file) and handed to the
    ``submit`` function of the matching ``dmlc_tracker`` backend. The ``ssh``
    launcher may run without a hostfile only when ``launch_worker`` is True.

    Raises:
        RuntimeError: if the launcher is unknown, if ``mpi`` (or ``ssh``
            without ``--launch-worker``) is selected without a hostfile, or
            if ``yarn``/``local``/``sge`` is combined with a hostfile.
    """

    def _str2bool(value):
        """Convert a command-line token to a bool.

        ``type=bool`` must not be used with argparse: ``bool('False')`` is
        True because every non-empty string is truthy.
        """
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', 'on', '1'):
            return True
        # The empty string stays False, as it was under ``type=bool``.
        if token in ('false', 'f', 'no', 'n', 'off', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n',
                        '--num-workers',
                        required=True,
                        type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s',
                        '--num-servers',
                        type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H',
                        '--hostfile',
                        type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir',
                        type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher',
                        type=str,
                        default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument(
        '--env-server',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the server processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument(
        '--env-worker',
        action='append',
        default=[],
        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the worker processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env',
                        action='append',
                        default=[],
                        help='given a environment variable, passes their \
                        values from current system to all workers and servers. \
                        Not necessary when launcher is local as in that case \
                        all environment variables which are set are copied.')
    parser.add_argument(
        '--elastic-training-enabled',
        type=_str2bool,
        default=False,
        help=' if this option is set to true, elastic training is enabled. \
                        If True, you should specify which instance pool to use by using option \
                        --instance-pool')
    parser.add_argument('--instance-pool', type=str, default='DEFAULT', help=' You can use '
                        ' [reservedInstancePoolId | \'spotInstance\', | \'DEFAULT\']'
                        '. In case of DEFAULT a file will be created in same folder '
                        ' where --hostfile lives. The default worker filename will be \'default_worker_file\'')
    parser.add_argument('--max-elastic-instances', type=int, default=0, help=' if instance pool is reserved'
                        ' or spotInstance, up to max-elastic-instances can be added to existing cluster')
    parser.add_argument('--launch-worker', type=_str2bool, default=False, help='whether this script should '
                        'only launch worker instances')
    parser.add_argument('--host',
                        type=str,
                        help='host name or ip of new worker host to launch')
    parser.add_argument(
        '--port',
        type=str,
        default='22',
        help='port number of new worker for ssh command to run by')
    parser.add_argument('command',
                        nargs='+',
                        help='command for launching the program')
    # TODO verify if elastic training enabled is true
    # verify that --instance-pool is defined ,
    # if --instance-pool is [reserved|spot], verify that --max-elastic-instances is defined
    # if --instance-pool is DEFAULT then , max_elastic_instance is not defined
    # launch-worker is true, verify we have host

    args, unknown = parser.parse_known_args()

    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown

    logging.info("BEGING %s", args)

    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    logging.info("JAHHAHA%s", args)

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher == 'ssh':
        # Without a hostfile, ssh is only accepted in launch-worker mode.
        if not has_hostfile and args.launch_worker is not True:
            raise RuntimeError(
                'The ssh launcher requires a hostfile (-H/--hostfile) '
                'unless --launch-worker is set')
        from dmlc_tracker import ssh
        logging.info("Vikas dmlc_tracker ssh %s", args)
        ssh.submit(args)
    elif launcher == 'mpi':
        if not has_hostfile:
            raise RuntimeError(
                'The mpi launcher requires a hostfile; pass one with '
                '-H/--hostfile')
        from dmlc_tracker import mpi
        mpi.submit(args)
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
        backend.submit(args)
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)
Пример #7
0
def main():
    """Parse the command line and submit the job through the chosen launcher.

    Options the parser does not recognise are appended to ``command`` so that
    they reach the launched program. ``--num-servers`` falls back to
    ``--num-workers`` when omitted. The parsed options are converted with
    ``dmlc_opts`` (defined elsewhere in this file).

    If both the ``WORKER_LAUNCH_TEMPLATE_ID`` and ``ELASTIC_WORKER_TAG``
    environment variables are set and ``launch_worker`` is False, a daemon
    thread running ``manage_elastic_instance`` is started before submission.
    The job is then handed to the matching ``dmlc_tracker`` backend's
    ``submit``. The ``ssh`` launcher may run without a hostfile only when
    ``launch_worker`` is True.

    Raises:
        RuntimeError: if the launcher is unknown, if ``mpi`` (or ``ssh``
            without ``--launch-worker``) is selected without a hostfile, or
            if ``yarn``/``local``/``sge`` is combined with a hostfile.
    """

    def _str2bool(value):
        """Convert a command-line token to a bool.

        ``type=bool`` must not be used with argparse: ``bool('False')`` is
        True because every non-empty string is truthy.
        """
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', 'on', '1'):
            return True
        # The empty string stays False, as it was under ``type=bool``.
        if token in ('false', 'f', 'no', 'n', 'off', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(description='Launch a distributed job')
    parser.add_argument('-n', '--num-workers', required=True, type=int,
                        help='number of worker nodes to be launched')
    parser.add_argument('-s', '--num-servers', type=int,
                        help='number of server nodes to be launched, \
                        in default it is equal to NUM_WORKERS')
    parser.add_argument('-H', '--hostfile', type=str,
                        help='the hostfile of slave machines which will run \
                        the job. Required for ssh and mpi launcher')
    parser.add_argument('--sync-dst-dir', type=str,
                        help='if specificed, it will sync the current \
                        directory into slave machines\'s SYNC_DST_DIR if ssh \
                        launcher is used')
    parser.add_argument('--launcher', type=str, default='ssh',
                        choices=['local', 'ssh', 'mpi', 'sge', 'yarn'],
                        help='the launcher to use')
    parser.add_argument('--env-server', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the server processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env-worker', action='append', default=[],
                        help='Given a pair of environment_variable:value, sets this value of \
                        environment variable for the worker processes. This overrides values of \
                        those environment variable on the machine where this script is run from. \
                        Example OMP_NUM_THREADS:3')
    parser.add_argument('--env', action='append', default=[],
                        help='given a environment variable, passes their \
                        values from current system to all workers and servers. \
                        Not necessary when launcher is local as in that case \
                        all environment variables which are set are copied.')
    parser.add_argument('--elastic-training-enabled', type=_str2bool, default=False,
                        help=' if this option is set to true, elastic training is enabled. \
                        If True, you should specify which instance pool to use by using option \
                        --instance-pool')
    parser.add_argument('--launch-worker', type=_str2bool, default=False, help='whether this script should '
                        'only launch worker instances')
    parser.add_argument('--host', type=str, help='host name or ip of new worker host to launch')
    parser.add_argument('--port', type=str, default='22', help='port number of new worker for ssh command to run by')
    parser.add_argument('command', nargs='+',
                        help='command for launching the program')

    args, unknown = parser.parse_known_args()

    # Anything argparse did not recognise is treated as part of the command.
    args.command += unknown

    logging.info("BEGIN args %s", args)

    if args.num_servers is None:
        args.num_servers = args.num_workers

    args = dmlc_opts(args)

    logging.info("args after dmlc_opts %s", args)

    elastic_env_present = (
        os.getenv('WORKER_LAUNCH_TEMPLATE_ID') is not None
        and os.getenv('ELASTIC_WORKER_TAG') is not None)
    if elastic_env_present and args.launch_worker is False:
        logging.info("Found launch template id and elastic worker tag in environment variable. Will start ET Management thread")
        thread = Thread(target=manage_elastic_instance,
                        args=(args.worker_host_file, args.num_workers))
        # Daemon thread: it must not keep the process alive after submit
        # returns. (Thread.setDaemon() is deprecated in favour of .daemon.)
        thread.daemon = True
        thread.start()

    launcher = args.cluster
    # NOTE(review): the 'None' string comparison suggests dmlc_opts may
    # stringify a missing hostfile -- confirm against dmlc_opts.
    has_hostfile = not (args.host_file is None or args.host_file == 'None')

    if launcher == 'ssh':
        # Without a hostfile, ssh is only accepted in launch-worker mode.
        if not has_hostfile and args.launch_worker is not True:
            raise RuntimeError(
                'The ssh launcher requires a hostfile (-H/--hostfile) '
                'unless --launch-worker is set')
        from dmlc_tracker import ssh
        logging.info("dmlc_tracker ssh %s", args)
        ssh.submit(args)
    elif launcher == 'mpi':
        if not has_hostfile:
            raise RuntimeError(
                'The mpi launcher requires a hostfile; pass one with '
                '-H/--hostfile')
        from dmlc_tracker import mpi
        mpi.submit(args)
    elif launcher in ('yarn', 'local', 'sge'):
        if has_hostfile:
            raise RuntimeError(
                'The %s launcher does not accept a hostfile; drop '
                '-H/--hostfile or use the ssh or mpi launcher' % launcher)
        if launcher == 'yarn':
            from dmlc_tracker import yarn as backend
        elif launcher == 'local':
            from dmlc_tracker import local as backend
        else:
            from dmlc_tracker import sge as backend
        backend.submit(args)
    else:
        raise RuntimeError('Unknown submission cluster type %s' % launcher)