def submit(args): """Submit function of local jobs.""" def mthread_submit(nworker, nserver, envs): """ customized submit script, that submit nslave jobs, each must contain args as parameter note this can be a lambda function containing additional parameters in input Parameters ---------- nworker: number of slave process to start up nserver: number of server nodes to start up envs: enviroment variables to be added to the starting programs """ procs = {} for i in range(nworker + nserver): if i < nworker: role = 'worker' else: role = 'server' procs[i] = Thread(target=exec_cmd, args=(args.command, args.local_num_attempt, role, i, envs)) procs[i].setDaemon(True) procs[i].start() # call submit, with nslave, the commands to run each job and submit function tracker.submit(args.num_workers, args.num_servers, fun_submit=mthread_submit, pscmd=(' '.join(args.command)))
def submit(args): assert args.host_file is not None with open(args.host_file) as f: tmp = f.readlines() assert len(tmp) > 0 hosts = [host.strip() for host in tmp if len(host.strip()) > 0] # When submit is called, the workers are assumed to have run 'grpc_worker.py'. def gRPC_submit(nworker, nserver, pass_envs): for i in range(nworker): worker = hosts[i] print('connecting to worker | ip:port | -', worker) # Package dmlc variables into protobuf dmlc_vars = fxgb_pb2.DMLC_VARS( DMLC_TRACKER_URI=pass_envs['DMLC_TRACKER_URI'], DMLC_TRACKER_PORT=pass_envs['DMLC_TRACKER_PORT'], DMLC_ROLE='worker', DMLC_NODE_HOST=worker[:worker.index(':')], DMLC_NUM_WORKER=pass_envs['DMLC_NUM_WORKER'], DMLC_NUM_SERVER=pass_envs['DMLC_NUM_SERVER'], ) # spawn thread to call RPC thread = Thread(target=run, args=(worker, dmlc_vars)) thread.setDaemon(True) thread.start() tracker.submit( args.num_workers, args.num_servers, fun_submit=gRPC_submit, hostIP=args.host_ip, )
def submit(args): assert args.host_file is not None with open(args.host_file) as f: tmp = f.readlines() assert len(tmp) > 0 hosts = [] for h in tmp: if len(h.strip()) > 0: # parse addresses of the form ip:port h = h.strip() i = h.find(":") p = "22" if i != -1: p = h[i + 1:] h = h[:i] # hosts now contain the pair ip, port hosts.append((h, p)) def ssh_submit(nworker, nserver, pass_envs): """ customized submit script """ # thread func to run the job def run(prog): subprocess.check_call(prog, shell=True) # sync programs if necessary local_dir = os.getcwd() + '/' working_dir = local_dir if args.sync_dst_dir is not None and args.sync_dst_dir != 'None': working_dir = args.sync_dst_dir pool = Pool(processes=len(hosts)) for h in hosts: pool.apply_async(sync_dir, args=(local_dir, h, working_dir)) pool.close() pool.join() # launch jobs for i in range(nworker + nserver): pass_envs['DMLC_ROLE'] = 'server' if i < nserver else 'worker' (node, port) = hosts[i % len(hosts)] pass_envs['DMLC_NODE_HOST'] = node pass_envs['PYTHONPATH'] = '/root/singa/build/python/' prog = get_env(pass_envs) + ' cd ' + working_dir + '; ' + ( ' '.join(args.command)) prog = 'ssh -o StrictHostKeyChecking=no ' + node + ' -p ' + port + ' \'' + prog + '\'' thread = Thread(target=run, args=(prog, )) thread.setDaemon(True) thread.start() return ssh_submit tracker.submit(args.num_workers, args.num_servers, fun_submit=ssh_submit, pscmd=(' '.join(args.command)), hostIP=args.host_ip)
def run(self): tracker.config_logger(self.args) env = { 'fun_submit': self.submit(), 'pscmd': self.cmd, } if self.args.localhost: env['hostIP'] = '127.0.0.1' tracker.submit(self.args.num_workers, self.args.num_servers, **env)
def run(self): tracker.config_logger(self.args) env = { 'fun_submit' : self.submit(), 'pscmd' : self.cmd, } if self.args.localhost: env['hostIP'] = '127.0.0.1' tracker.submit(self.args.num_workers, self.args.num_servers, **env)
for k, v in pass_env.items(): env[k] = str(v) env['DMLC_CPU_VCORES'] = str(args.vcores) env['DMLC_MEMORY_MB'] = str(args.memory_mb) env['DMLC_NUM_WORKER'] = str(args.nworker) env['DMLC_NUM_SERVER'] = str(args.server_nodes) env['DMLC_HDFS_OPTS'] = str(args.libhdfs_opts) if args.files != None: for flst in args.files: for f in flst.split('#'): fset.add(f) for f in fset: cmd += ' -file %s' % f cmd += ' -jobname %s ' % args.jobname cmd += ' -tempdir %s ' % args.tempdir cmd += ' -queue %s ' % args.queue cmd += (' '.join(['./run_hdfs_prog.py'] + args.command)) def run(): if args.verbose != 0: print cmd subprocess.check_call(cmd, shell = True, env = env) thread = Thread(target = run, args=()) thread.setDaemon(True) thread.start() tracker.submit(args.nworker, args.server_nodes, fun_submit = yarn_submit, verbose = args.verbose, pscmd= (' '.join(args.command)))
def run(self): tracker.config_logger(self.args) tracker.submit(self.args.num_workers, self.args.num_servers, fun_submit=self.submit(), pscmd=self.cmd)
# NOTE(review): fragment from inside a yarn_submit() closure — `flst`, `fset`,
# `cmd`, `env`, `args`, `unknown`, `yarn_submit` and YARN_BOOT_PY come from
# the surrounding scope (the first loop is the body of `for flst in args.files`).
for f in flst.split('#'):
    fset.add(f)
for f in fset:
    cmd += ' -file %s' % f
cmd += ' -jobname %s ' % args.jobname
cmd += ' -tempdir %s ' % args.tempdir
cmd += ' -queue %s ' % args.queue
if args.app_classpath:
    cmd += ' -appcp %s ' % args.app_classpath
for entry in args.env:
    cmd += ' -env %s ' % entry
cmd += (' '.join(['./run_hdfs_prog.py'] + args.command))

def run():
    logging.debug(cmd)
    subprocess.check_call(cmd, shell=True, env=env)

# appended after `run` is defined; the closure reads `cmd` at call time,
# so the extra args are still picked up when the thread actually executes
if unknown:
    cmd += ' ' + ' '.join(unknown)
thread = Thread(target=run, args=())
thread.setDaemon(True)
thread.start()
tracker.config_logger(args)
tracker.submit(args.nworker, args.server_nodes, fun_submit=yarn_submit,
               pscmd=(' '.join([YARN_BOOT_PY] + args.command) + ' ' +
                      ' '.join(unknown)))
env = os.environ.copy() for k, v in pass_env.items(): env[k] = str(v) env['DMLC_CPU_VCORES'] = str(args.vcores) env['DMLC_MEMORY_MB'] = str(args.memory_mb) env['DMLC_NUM_WORKER'] = str(args.nworker) env['DMLC_NUM_SERVER'] = str(args.server_nodes) env['DMLC_HDFS_OPTS'] = str(args.libhdfs_opts) if args.files != None: for flst in args.files: for f in flst.split('#'): fset.add(f) for f in fset: cmd += ' -file %s' % f cmd += ' -jobname %s ' % args.jobname cmd += ' -tempdir %s ' % args.tempdir cmd += ' -queue %s ' % args.queue cmd += (' '.join(['./run_hdfs_prog.py'] + args.command)) def run(): logging.debug(cmd) subprocess.check_call(cmd, shell = True, env = env) thread = Thread(target = run, args=()) thread.setDaemon(True) thread.start() tracker.config_logger(args) tracker.submit(args.nworker, args.server_nodes, fun_submit = yarn_submit, pscmd= (' '.join([YARN_BOOT_PY] + args.command)))
# NOTE(review): fragment from inside a yarn_submit() closure — `env`, `cmd`,
# `fset`, `args` and `yarn_submit` are defined outside this view.
env['DMLC_NUM_WORKER'] = str(args.nworker)
env['DMLC_NUM_SERVER'] = str(args.server_nodes)
env['DMLC_HDFS_OPTS'] = str(args.libhdfs_opts)
if args.files is not None:  # identity test per PEP 8, was `!= None`
    for flst in args.files:
        for f in flst.split('#'):
            fset.add(f)
for f in fset:
    cmd += ' -file %s' % f
cmd += ' -jobname %s ' % args.jobname
cmd += ' -tempdir %s ' % args.tempdir
cmd += ' -queue %s ' % args.queue
cmd += (' '.join(['./run_hdfs_prog.py'] + args.command))

def run():
    if args.verbose != 0:
        # was Python 2 `print cmd`, a syntax error under Python 3
        print(cmd)
    subprocess.check_call(cmd, shell=True, env=env)

thread = Thread(target=run, args=())
thread.daemon = True  # setDaemon() is deprecated since Python 3.10
thread.start()
# belongs to the enclosing submit() scope in the full file
tracker.submit(args.nworker, args.server_nodes, fun_submit=yarn_submit,
               verbose=args.verbose, pscmd=(' '.join(args.command)))
def run(self): tracker.config_logger(self.args) tracker.submit(self.args.num_workers, self.args.num_servers, fun_submit = self.submit(), pscmd = self.cmd)
""" def run(prog): """""" subprocess.check_call(prog, shell = True) cmd = '' if args.hostfile is not None: cmd = '--hostfile %s' % (args.hostfile) cmd += ' ' + ' '.join(args.command) + ' ' + ' '.join(unknown) # start servers if nserver > 0: pass_envs['DMLC_ROLE'] = 'server' prog = 'mpirun -n %d %s %s' % (nserver, get_mpi_env(pass_envs), cmd) thread = Thread(target = run, args=(prog,)) thread.setDaemon(True) thread.start() if nworker > 0: pass_envs['DMLC_ROLE'] = 'worker' prog = 'mpirun -n %d %s %s' % (nworker, get_mpi_env(pass_envs), cmd) thread = Thread(target = run, args=(prog,)) thread.setDaemon(True) thread.start() tracker.config_logger(args) tracker.submit(args.nworker, args.server_nodes, fun_submit = mpi_submit, hostIP=args.host_ip, pscmd=(' '.join(args.command) + ' ' + ' '.join(unknown)))
def run(self): utils.config_logger(self.args) tracker.submit( self.num_workers, fun_submit=self.submit(), pscmd=self.cmd)
subprocess.check_call(prog, shell = True) cmd = '' if args.hostfile is not None: cmd = '--hostfile %s' % (args.hostfile) cmd += ' ' + ' '.join(args.command) + ' ' + ' '.join(unknown) pass_envs['SEMI_SYNC_MODE'] = 1 pass_envs['SYNC_MODE'] = 1 pass_envs['LEARNING_RATE'] = 0.01 # start servers if nserver > 0: pass_envs['DMLC_ROLE'] = 'server' prog = 'mpirun -n %d %s %s' % (nserver, get_mpi_env(pass_envs), cmd) thread = Thread(target = run, args=(prog,)) thread.setDaemon(True) thread.start() if nworker > 0: pass_envs['DMLC_ROLE'] = 'worker' prog = 'mpirun -n %d %s %s' % (nworker, get_mpi_env(pass_envs), cmd) thread = Thread(target = run, args=(prog,)) thread.setDaemon(True) thread.start() tracker.config_logger(args) tracker.submit(args.nworker, args.server_nodes, fun_submit = mpi_submit, hostIP=args.host_ip, pscmd=(' '.join(args.command) + ' ' + ' '.join(unknown)))
sargs = " ".join(args.command) if args.hostfile is None: cmd = "mpirun -n %d" % (nworker + nserver) else: cmd = "mpirun -n %d --hostfile %s " % (nworker + nserver, args.hostfile) for k, v in pass_envs.items(): cmd += " -env %s %s" % (k, v) # cmd += ' -x %s' % k cmd += " " cmd += " ".join(args.command) cmd += " " cmd += " ".join(unknown) # print '%s' % cmd # known issue: results do not show in emacs eshell def run(): subprocess.check_call(cmd, shell=True, env=env) thread = Thread(target=run, args=()) thread.setDaemon(True) thread.start() tracker.config_logger(args) # call submit, with nslave, the commands to run each job and submit function tracker.submit( args.nworker, args.server_nodes, fun_submit=mpi_submit, pscmd=(" ".join(args.command) + " " + " ".join(unknown)) )
raise Exception('Get nonzero return code=%d' % ret)
# NOTE(review): the raise above is the tail of an exec_cmd() helper whose
# definition starts before this view; `args`, `unknown`, `exec_cmd` and
# `tracker` come from the surrounding file.

#
# Note: this submit script is only used for demo purpose
# submission script using python multi-threading
#
def mthread_submit(nworker, nserver, envs):
    """Demo submit function: launch every rank as a local daemon thread.

    Note this can be a lambda function containing additional parameters
    in input.

    Parameters
    ----------
    nworker : int
        Number of slave processes to start up.
    nserver : int
        Number of server nodes to start up.
    envs : dict
        Environment variables to be added to the starting programs.
    """
    procs = {}
    for rank in range(nworker + nserver):
        node_role = 'worker' if rank < nworker else 'server'
        procs[rank] = Thread(target=exec_cmd,
                             args=(args.command + unknown, node_role,
                                   rank, envs))
        procs[rank].setDaemon(True)
        procs[rank].start()

tracker.config_logger(args)
# call submit, with nslave, the commands to run each job and submit function
tracker.submit(args.nworker, args.server_nodes,
               fun_submit=mthread_submit,
               pscmd=(' '.join(args.command) + ' ' + ' '.join(unknown)))
env = os.environ.copy() for k, v in pass_env.items(): env[k] = str(v) env['DMLC_CPU_VCORES'] = str(args.vcores) env['DMLC_MEMORY_MB'] = str(args.memory_mb) env['DMLC_NUM_WORKER'] = str(args.nworker) env['DMLC_NUM_SERVER'] = str(args.server_nodes) env['DMLC_HDFS_OPTS'] = str(args.libhdfs_opts) if args.files != None: for flst in args.files: for f in flst.split('#'): fset.add(f) for f in fset: cmd += ' -file %s' % f cmd += ' -jobname %s ' % args.jobname cmd += ' -tempdir %s ' % args.tempdir cmd += ' -queue %s ' % args.queue cmd += (' '.join(['./run_hdfs_prog.py'] + args.command)) def run(): logging.debug(cmd) subprocess.check_call(cmd, shell = True, env = env) thread = Thread(target = run, args=()) thread.setDaemon(True) thread.start() tracker.config_logger(args) tracker.submit(args.nworker, args.server_nodes, fun_submit = yarn_submit, pscmd= (' '.join(['../yarn/run_hdfs_prog.py'] + args.command)))