Example #1
        kmap['timeout'] = 'mapred.task.timeout'
        kmap['memory_mb'] = 'mapred.job.map.memory.mb'
    cmd = '%s jar %s' % (args.hadoop_binary, args.hadoop_streaming_jar)
    cmd += ' -D%s=%d' % (kmap['nworker'], nworker)
    cmd += ' -D%s=%s' % (kmap['jobname'], args.jobname)
    envstr = ','.join('%s=%s' % (k, str(v)) for k, v in worker_envs.items())
    cmd += ' -D%s=\"%s\"' % (kmap['env'], envstr)
    if args.vcores != -1:
        if kmap['nthread'] is None:
            warnings.warn('nthread can only be set in Yarn (Hadoop version 2.0 or newer); '
                          'it is recommended to use Yarn to submit rabit jobs', stacklevel=2)
        else:
            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
    cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
    if args.memory_mb != -1:
        cmd += ' -D%s=%d' % (kmap['memory_mb'], args.memory_mb)

    cmd += ' -input %s -output %s' % (args.input, args.output)
    cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + worker_args))
    if args.files is not None:
        for flst in args.files:
            for f in flst.split('#'):
                fset.add(f)
    for f in fset:
        cmd += ' -file %s' % f
    print cmd
    subprocess.check_call(cmd, shell = True)

fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(nworker, worker_args, worker_envs, int(hadoop_version[0]) >= 2)
tracker.submit(args.nworker, [], fun_submit = fun_submit, verbose = args.verbose, hostIP = args.host_ip)
Example #2
    if args.vcores != -1:
        if kmap['nthread'] is None:
            warnings.warn('nthread can only be set in Yarn (Hadoop version 2.0 or newer); '
                          'it is recommended to use Yarn to submit rabit jobs', stacklevel=2)
        else:
            cmd += ' -D%s=%d' % (kmap['nthread'], args.vcores)
    cmd += ' -D%s=%d' % (kmap['timeout'], args.timeout)
    if args.memory_mb != -1:
        cmd += ' -D%s=%d' % (kmap['memory_mb'], args.memory_mb)

    cmd += ' -input %s -output %s' % (args.input, args.output)
    cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command +
                                                                worker_args))
    if args.files is not None:
        for flst in args.files:
            for f in flst.split('#'):
                fset.add(f)
    for f in fset:
        cmd += ' -file %s' % f
    print cmd
    subprocess.check_call(cmd, shell=True)


fun_submit = lambda nworker, worker_args, worker_envs: hadoop_streaming(
    nworker, worker_args, worker_envs,
    int(hadoop_version[0]) >= 2)
tracker.submit(args.nworker, [],
               fun_submit=fun_submit,
               verbose=args.verbose,
               hostIP=args.host_ip)
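
The two snippets above rely on a kmap dictionary (only its last two entries are visible) and a fourth argument to hadoop_streaming that is presumably a use_yarn flag, supplied by the lambda at the end. A minimal sketch of how such a mapping might be built is shown below; make_kmap is a hypothetical helper, and the YARN-era property names are assumptions based on common Hadoop settings, not taken from the original script.

def make_kmap(use_yarn):
    # hypothetical helper: map logical option names to Hadoop -D property names
    kmap = {}
    if use_yarn:
        kmap['nworker'] = 'mapreduce.job.maps'
        kmap['jobname'] = 'mapreduce.job.name'
        kmap['env'] = 'mapreduce.map.env'
        kmap['nthread'] = 'mapreduce.map.cpu.vcores'
    else:
        kmap['nworker'] = 'mapred.map.tasks'
        kmap['jobname'] = 'mapred.job.name'
        kmap['env'] = 'mapred.child.env'
        kmap['nthread'] = None  # per-task vcores cannot be requested before YARN
    kmap['timeout'] = 'mapred.task.timeout'
    kmap['memory_mb'] = 'mapred.job.map.memory.mb'
    return kmap
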
Example #3
fo.write('source ~/.bashrc\n')
fo.write('\"$@\"\n')
fo.close()
#
# submission script using SGE (qsub)
#
def sge_submit(nslave, worker_args, worker_envs):
    """
      customized submit script, that submit nslave jobs, each must contain args as parameter
      note this can be a lambda function containing additional parameters in input
      Parameters
         nslave number of slave process to start up
         args arguments to launch each job
              this usually includes the parameters of master_uri and parameters passed into submit
    """
    env_arg = ','.join(['%s=\"%s\"' % (k, str(v)) for k, v in worker_envs.items()])
    cmd = 'qsub -cwd -t 1-%d -S /bin/bash' % nslave
    if args.queue != 'default':
        cmd += ' -q %s' % args.queue
    cmd += ' -N %s ' % args.jobname
    cmd += ' -e %s -o %s' % (args.logdir, args.logdir)
    cmd += ' -pe orte %d' % (args.vcores)
    cmd += ' -v %s,PATH=${PATH}:.' % env_arg
    cmd += ' %s %s' % (runscript, ' '.join(args.command + worker_args))
    print cmd
    subprocess.check_call(cmd, shell = True)
    print 'Waiting for the jobs to come up...'

# call submit, with nslave, the commands to run each job and submit function
tracker.submit(args.nworker, [], fun_submit = sge_submit, verbose = args.verbose)
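
The fo.write(...) lines at the top of this example are the tail of the code that generates the runscript wrapper used in the qsub command above. A minimal sketch of how that wrapper might be created; the file name and location are placeholders, not taken from the original script.

runscript = '%s/rabit_runscript.sh' % args.logdir  # hypothetical path
fo = open(runscript, 'w')
fo.write('#!/bin/bash\n')        # assumed shebang
fo.write('source ~/.bashrc\n')   # pick up the user environment on the compute node
fo.write('\"$@\"\n')             # run whatever command qsub passes to the script
fo.close()
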
Example #4
                raise Exception('Get nonzero return code=%d' % ret)


#
#  Note: this submit script is only used for demo purposes
#  submission script using Python multi-threading
#
def mthread_submit(nslave, worker_args, worker_envs):
    """
      customized submit script, that submit nslave jobs, each must contain args as parameter
      note this can be a lambda function containing additional parameters in input
      Parameters
         nslave number of slave process to start up
         args arguments to launch each job
              this usually includes the parameters of master_uri and parameters passed into submit
    """
    procs = {}
    for i in range(nslave):
        procs[i] = Thread(target=exec_cmd,
                          args=(args.command + worker_args, i, worker_envs))
        procs[i].daemon = True
        procs[i].start()
    for i in range(nslave):
        procs[i].join()


# call submit, with nslave, the commands to run each job and submit function
tracker.submit(args.nworker, [],
               fun_submit=mthread_submit,
               verbose=args.verbose)
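
This example (and example #7 below) hands an exec_cmd helper to each Thread, but only its tail is visible in example #7. A minimal sketch of what it might look like, assuming it runs the worker command in a subprocess with the tracker environment exported; the rabit_task_id variable name is an assumption.

import os
import subprocess

def exec_cmd(cmd, taskid, worker_envs):
    # copy the current environment and add the variables handed out by the tracker
    env = os.environ.copy()
    for k, v in worker_envs.items():
        env[str(k)] = str(v)
    env['rabit_task_id'] = str(taskid)  # assumed variable name
    ret = subprocess.call(' '.join(cmd), shell=True, env=env)
    if ret == 0:
        print('Thread %d exit with 0' % taskid)
    else:
        raise Exception('Get nonzero return code=%d' % ret)
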
Example #5
sys.path.append(os.path.dirname(__file__)+'/src/')
import rabit_tracker as tracker

#
#  Note: this submit script is only used for example purposes
#  It does not have to be mpirun; it can be any job submission script that starts the jobs, e.g. qsub, hadoop streaming, etc.
#  
def mpi_submit(nslave, args):
    """
      customized submit script, that submit nslave jobs, each must contain args as parameter
      note this can be a lambda function containing additional parameters in input
      Parameters
         nslave number of slave process to start up
         args arguments to launch each job
              this usually includes the parameters of master_uri and parameters passed into submit
    """
    if args[0] == 'local':
        cmd = ' '.join(['mpirun -n %d' % (nslave)] + args[1:])
    else:
        cmd = ' '.join(['mpirun -n %d --hostfile %s' % (nslave, args[0])] + args[1:])
    print cmd
    subprocess.check_call(cmd, shell = True)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: <nslave> <machine_file> <cmd>'
        print 'if <machine_file> == local, we will run using local mode'
        exit(0)        
    # call submit, with nslave, the commands to run each job and submit function
    tracker.submit(int(sys.argv[1]), sys.argv[2:], fun_submit= mpi_submit)
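
For reference, a hypothetical invocation of this launcher from the shell (the script file name is a placeholder):

    python rabit_mpi_launcher.py 4 local ./my_worker arg1 arg2       # run 4 workers locally
    python rabit_mpi_launcher.py 4 hosts.txt ./my_worker arg1 arg2   # run 4 workers on the hosts in hosts.txt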
Example #6
if hadoop_binary is None:
  parser.add_argument('-hb', '--hadoop_binary', required=True,
                      help='path to the hadoop binary')
if hadoop_streaming_jar is None:
  parser.add_argument('-hs', '--hadoop_streaming_jar', required=True,
                      help='path to the hadoop streaming jar file')
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('-m', '--mapper', required=True)
parser.add_argument('-a', '--args', required=True)
parser.add_argument('-f', '--file', required=True)
args = parser.parse_args()

if hadoop_binary is not None:
  args.hadoop_binary = hadoop_binary
if hadoop_streaming_jar is not None:
  args.hadoop_streaming_jar = hadoop_streaming_jar

def hadoop_streaming(nslaves, slave_args):
  cmd = '%s jar %s -D mapred.map.tasks=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nslaves)
  cmd += ' -input %s -output %s' % (args.input, args.output)
  cmd += ' -mapper \"%s %s %s\" -reducer \"/bin/cat\" ' % (args.mapper, args.args, ' '.join(slave_args))
  for f in args.file.split('#'):
    cmd += ' -file %s' % (f)
  print cmd
  subprocess.check_call(cmd, shell = True)

start = time.time()
tracker.submit(args.nslaves, [], fun_submit= hadoop_streaming)
print 'Whole run took %s seconds' % (time.time() - start)
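
A hypothetical invocation of this launcher (all paths are placeholders; the option that sets args.nslaves is assumed to be added elsewhere in the parser, since it is not visible here):

    python hadoop_launcher.py \
        -hb $HADOOP_HOME/bin/hadoop \
        -hs $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming.jar \
        -i hdfs:///user/me/input -o hdfs:///user/me/output \
        -m ./mapper.py -a "num_round=10" -f mapper.py#model.conf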
Example #7
                print 'Thread %d exit with 0' % taskid
            return
        else:
            if os.name == 'nt':
                # os.exit does not exist; os._exit terminates the process immediately
                os._exit(-1)
            else:
                raise Exception('Get nonzero return code=%d' % ret)
#
#  Note: this submit script is only used for demo purposes
#  submission script using Python multi-threading
#
def mthread_submit(nslave, worker_args, worker_envs):
    """
      customized submit script, that submit nslave jobs, each must contain args as parameter
      note this can be a lambda function containing additional parameters in input
      Parameters
         nslave number of slave process to start up
         args arguments to launch each job
              this usually includes the parameters of master_uri and parameters passed into submit
    """       
    procs = {}
    for i in range(nslave):
        procs[i] = Thread(target = exec_cmd, args = (args.command + worker_args, i, worker_envs))
        procs[i].daemon = True
        procs[i].start()
    for i in range(nslave):
        procs[i].join()

# call submit, with nslave, the commands to run each job and submit function
tracker.submit(args.nworker, [], fun_submit = mthread_submit, verbose = args.verbose)