def _configure(self):

    # Find rsh command
    self.launch_command = ru.which('rsh')

    if not self.launch_command:
        raise RuntimeError("rsh not found!")
def _configure(self):

    # runjob: job launcher for IBM BG/Q systems, e.g. Joule
    self.launch_command = ru.which('runjob')

    raise NotImplementedError('RUNJOB LaunchMethod still coupled to '
                              'scheduler/ResourceManager')
def _configure(self):

    # poe: LSF specific wrapper for MPI (e.g. yellowstone)
    self.launch_command = ru.which('poe')

    if not self.launch_command:
        raise RuntimeError("poe not found!")
def _configure(self):

    self.launch_command = ru.which([
        'mpirun',             # General case
        'mpirun_rsh',         # Gordon @ SDSC
        'mpirun-mpich-mp',    # Mac OSX MacPorts
        'mpirun-openmpi-mp'   # Mac OSX MacPorts
    ])

    self.ccmrun_command = ru.which([
        'ccmrun',             # General case
    ])

    if not self.ccmrun_command:
        raise RuntimeError("ccmrun not found!")

    self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
def _configure(self):

    self._mpt = False
    self._rsh = False

    if '_rsh' in self.name.lower():
        self._rsh = True
        self.launch_command = ru.which([
            'mpirun_rsh',         # Gordon (SDSC)
            'mpirun'              # general case
        ])

    elif '_mpt' in self.name.lower():
        self._mpt = True
        self.launch_command = ru.which([
            'mpirun_mpt',         # Cheyenne (NCAR)
            'mpirun'              # general case
        ])

    else:
        self.launch_command = ru.which([
            'mpirun-mpich-mp',    # Mac OSX
            'mpirun-openmpi-mp',  # Mac OSX
            'mpirun',             # general case
        ])

    # don't use the full pathname as the user might load a different
    # compiler / MPI library suite from his CU pre_exec that requires
    # the launcher from that version -- see #572.
    # FIXME: then why are we doing this LM setup in the first place??
    if self.launch_command:
        self.launch_command = os.path.basename(self.launch_command)

    # do we need ccmrun or dplace?
    if '_ccmrun' in self.name:
        self._ccmrun = ru.which('ccmrun')
        if not self._ccmrun:
            raise RuntimeError("ccmrun not found!")

    if '_dplace' in self.name:
        self._dplace = ru.which('dplace')
        if not self._dplace:
            raise RuntimeError("dplace not found!")

    self.mpi_version, self.mpi_flavor = \
                                 self._get_mpi_info(self.launch_command)
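# The `_configure` methods above end by calling `self._get_mpi_info(...)` to
# obtain the MPI version and flavor; that helper is not shown in this section.
# The sketch below is a hypothetical illustration of such a probe, assuming
# only that the launcher accepts `--version` and that `ru.sh_callout` returns
# an `(out, err, ret)` tuple as in the `srun` snippet further down.  The
# marker strings and the name `detect_mpi_flavor` are assumptions, not the
# actual implementation.
import radical.utils as ru

def detect_mpi_flavor(launch_command):

    if not launch_command:
        return None, None

    out, err, ret = ru.sh_callout('%s --version' % launch_command)
    if ret:
        return None, None

    text = (out + err).lower()
    if   'open mpi' in text or 'open rte' in text: flavor = 'OMPI'
    elif 'hydra'    in text or 'mpich'    in text: flavor = 'MPICH'
    else                                          : flavor = 'unknown'

    # take the first token that looks like a dotted version number
    version = None
    for token in text.split():
        if token[0].isdigit() and '.' in token:
            version = token.strip(',')
            break

    return version, flavor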
def _configure(self):

    self.launch_command = ru.which([
        'mpirun',             # General case
        'mpirun_rsh',         # Gordon @ SDSC
        'mpirun-mpich-mp',    # Mac OSX MacPorts
        'mpirun-openmpi-mp'   # Mac OSX MacPorts
    ])

    self.ccmrun_command = ru.which([
        'ccmrun',             # General case
    ])

    if not self.ccmrun_command:
        raise RuntimeError("ccmrun not found!")

    self.mpi_version, self.mpi_flavor = self._get_mpi_info(
                                                       self.launch_command)
def test_which():
    """
    Test if 'which' can find things
    """

    shell_date = os.path.normpath(os.popen("which date").read().strip())
    utils_date = ru.which('date')

    assert (shell_date == utils_date), \
           "'%s' != '%s'" % (shell_date, utils_date)
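# For a cross-check that does not spawn a shell, the standard library offers
# `shutil.which`, which resolves executables against the PATH much like the
# test above.  This is only a sketch of an additional test one could add
# (assuming Python 3 and that `date` is on the PATH); it is not part of the
# original test suite.
import os
import shutil

import radical.utils as ru

def test_which_against_shutil():

    # shutil.which and ru.which should agree on a ubiquitous tool like `date`
    stdlib_date = shutil.which('date')
    utils_date  = ru.which('date')

    assert stdlib_date and utils_date, "'date' not found on PATH"
    assert os.path.normpath(stdlib_date) == os.path.normpath(utils_date), \
           "'%s' != '%s'" % (stdlib_date, utils_date)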
def _configure(self):

    self.launch_command = ru.which([
        'mpirun',             # General case
        'mpirun_rsh',         # Gordon @ SDSC
        'mpirun-mpich-mp',    # Mac OSX MacPorts
        'mpirun-openmpi-mp'   # Mac OSX MacPorts
    ])

    self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
def _configure(self):

    self.launch_command = ru.which('srun')

    out, err, ret = ru.sh_callout('%s -V' % self.launch_command)
    if ret:
        raise RuntimeError('cannot use srun [%s] [%s]' % (out, err))

    self._version = out.split()[-1]
    self._log.debug('using srun from %s [%s]',
                    self.launch_command, self._version)
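# For illustration of the parsing step above: `srun -V` typically prints a
# single line such as `slurm 17.11.5` (the exact wording varies per Slurm
# release, so the sample string below is an assumption), from which the last
# whitespace-separated token is taken as the version.
out = 'slurm 17.11.5'          # hypothetical sample output of `srun -V`

version = out.split()[-1]
assert version == '17.11.5'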
def _configure(self):

    self.launch_command = ru.which([
        'mpiexec',             # General case
        'mpiexec.mpich',       # Linux, MPICH
        'mpiexec.hydra',       # Linux, MPICH
        'mpiexec.openmpi',     # Linux, OpenMPI
        'mpiexec-mpich-mp',    # Mac OSX MacPorts
        'mpiexec-openmpi-mp'   # Mac OSX MacPorts
    ])

    self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
def discover(cls, logger=None):

    eenv = cls(logger)

    # detect nodes, cores and memory available
    eenv._detect_nodes()
    eenv._detect_cores_and_memory()

    # check for 'mpirun'
    eenv._mpirun_location = which('mpirun')
    eenv._aprun_location  = which('aprun')
    eenv._ssh_location    = which('ssh')

    # suggest a launch method.  the current precedence is
    # aprun, mpirun, ssh, fork.  this can be overridden
    # by passing the '--launch-method' parameter to the agent.
    if eenv._aprun_location is not None:
        eenv._launch_method = LAUNCH_METHOD_APRUN
    elif eenv._mpirun_location is not None:
        eenv._launch_method = LAUNCH_METHOD_MPIRUN
    elif eenv._ssh_location is not None:
        eenv._launch_method = LAUNCH_METHOD_SSH
    else:
        eenv._launch_method = LAUNCH_METHOD_LOCAL

    # create node dictionary
    for rn in eenv._raw_nodes:
        if rn not in eenv._nodes:
            eenv._nodes[rn] = {#'_count': 1,
                               'cores' : eenv._cores_per_node,
                               'memory': eenv._memory_per_node}
          # else:
          #     eenv._nodes[rn]['_count'] += 1

    if logger is not None:
        logger.info(message="Discovered execution environment: %s"
                            % eenv._nodes,
                    suffix=LOG_SUFFIX)

    return eenv
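# The aprun > mpirun > ssh > fork precedence above can also be written as a
# data-driven loop, which makes the ordering (and a '--launch-method'
# override) easier to adjust.  This is an illustrative rewrite under the same
# names, not the agent's actual code.
candidates = [('aprun' , LAUNCH_METHOD_APRUN ),
              ('mpirun', LAUNCH_METHOD_MPIRUN),
              ('ssh'   , LAUNCH_METHOD_SSH   )]

eenv._launch_method = LAUNCH_METHOD_LOCAL        # fallback: local fork
for exe, method in candidates:
    if which(exe) is not None:
        eenv._launch_method = method
        break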
def _find_executable(cls, names):
    """
    Takes a (list of) name(s) and looks for an executable in the path.  It
    will return the first match found, or `None` if none of the given names
    is found.
    """

    if not isinstance(names, list):
        names = [names]

    for name in names:
        ret = ru.which(name)
        if ret:
            return ret

    return None
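# A brief usage sketch for `_find_executable` above: a single name and a list
# of fallback names are both accepted, and the first hit on the PATH wins.
# The class name `LaunchMethod` (and its exposure as a classmethod) is an
# assumption made for illustration only.
ssh = LaunchMethod._find_executable('ssh')
mpi = LaunchMethod._find_executable(['mpirun_rsh',    # Gordon @ SDSC
                                     'mpirun'])       # general case

if not mpi:
    raise RuntimeError('no mpirun variant found in path')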
def _configure(self):

    # mpirun_rsh (e.g. on Gordon@SDSC, Stampede@TACC)
    if not ru.which('mpirun_rsh'):
        raise Exception("mpirun_rsh could not be found")

    # We don't use the full pathname as the user might load a different
    # compiler / MPI library suite from his CU pre_exec that requires
    # the launcher from that version, as experienced on stampede in #572.
    self.launch_command = 'mpirun_rsh'

    # alas, the way to transplant env variables to the target node differs
    # per mpi(run) version...
    version_info = sp.check_output(['%s -v' % self.launch_command],
                                   shell=True)
    if 'version:' in version_info:
        self.launch_version = version_info.split(':')[1].strip().lower()
    else:
        self.launch_version = 'unknown'
def _configure(self):

    # Find ssh command
    command = ru.which('ssh')

    if command is not None:

        # Some MPI environments (e.g. SGE) put a link to rsh as "ssh" into
        # the path.  We try to detect that and then use different arguments.
        if os.path.islink(command):

            target = os.path.realpath(command)

            if os.path.basename(target) == 'rsh':
                self._log.info('Detected that "ssh" is a link to "rsh".')
                return target

        command = '%s -o StrictHostKeyChecking=no -o ControlMaster=auto' % command

    self.launch_command = command
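# The ssh-vs-rsh detection above relies only on the standard library.  Pulled
# out as a free function it is easy to exercise in isolation; the name
# `resolves_to_rsh` and the sample path are assumptions for illustration, not
# part of the original code.
import os

def resolves_to_rsh(path):
    # True if `path` is a symlink whose target is named 'rsh' (as some SGE
    # installations set up for their "ssh" wrapper)
    return os.path.islink(path) and \
           os.path.basename(os.path.realpath(path)) == 'rsh'

# hypothetical usage with a discovered ssh path
ssh_path = '/opt/sge/bin/ssh'            # illustrative path, not a real default
if resolves_to_rsh(ssh_path):
    print('ssh is a link to rsh - use rsh-style arguments')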
def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
    """
    This hook is symmetric to the config hook above, and is called during
    shutdown sequence, for the sake of freeing allocated resources.
    """

    if 'dvm_uri' in lm_info:
        try:
            logger.info('terminating dvm')
            orterun = ru.which('orterun')
            if not orterun:
                raise Exception("Couldn't find orterun")
            ru.sh_callout('%s --hnp %s --terminate'
                         % (orterun, lm_info['dvm_uri']))
            profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])

        except Exception as e:
            # use the same event name as for runtime failures - those are
            # not distinguishable at the moment from termination failures
            profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
            logger.exception('dvm termination failed')
def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
    """
    This hook is symmetric to the config hook above, and is called during
    shutdown sequence, for the sake of freeing allocated resources.
    """

    if 'dvm_uri' in lm_info:
        try:
            logger.info('terminating dvm')
            orterun = ru.which('orterun')
            if not orterun:
                raise Exception("Couldn't find orterun")
            ru.sh_callout('%s --hnp %s --terminate'
                         % (orterun, lm_info['dvm_uri']))
            profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])

        except Exception as e:
            # use the same event name as for runtime failures - those are
            # not distinguishable at the moment from termination failures
            profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'], msg=e)
            logger.exception('dvm termination failed')
def initialize(self): self._pwd = os.getcwd() self.gtod = "%s/gtod" % self._pwd self.register_input(rps.AGENT_EXECUTING_PENDING, rpc.AGENT_EXECUTING_QUEUE, self.work) self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING, rpc.AGENT_STAGING_OUTPUT_QUEUE) self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB) self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb) addr_wrk = self._cfg['bridges']['funcs_req_queue'] addr_res = self._cfg['bridges']['funcs_res_queue'] self._log.debug('wrk in addr: %s', addr_wrk['addr_in' ]) self._log.debug('res out addr: %s', addr_res['addr_out']) self._funcs_req = rpu.Queue(self._session, 'funcs_req_queue', rpu.QUEUE_INPUT, self._cfg, addr_wrk['addr_in']) self._funcs_res = rpu.Queue(self._session, 'funcs_res_queue', rpu.QUEUE_OUTPUT, self._cfg, addr_res['addr_out']) self._cancel_lock = ru.RLock() self._cus_to_cancel = list() self._cus_to_watch = list() self._watch_queue = queue.Queue () self._pid = self._cfg['pid'] # run watcher thread self._collector = mt.Thread(target=self._collect) self._collector.daemon = True self._collector.start() # we need to launch the executors on all nodes, and use the # agent_launcher for that self._launcher = rp.agent.LaunchMethod.create( name = self._cfg.get('agent_launch_method'), cfg = self._cfg, session = self._session) # now run the func launcher on all nodes ve = os.environ.get('VIRTUAL_ENV', '') exe = ru.which('radical-pilot-agent-funcs') if not exe: exe = '%s/rp_install/bin/radical-pilot-agent-funcs' % self._pwd for idx, node in enumerate(self._cfg['rm_info']['node_list']): uid = 'func_exec.%04d' % idx pwd = '%s/%s' % (self._pwd, uid) funcs = {'uid' : uid, 'description': {'executable' : exe, 'arguments' : [pwd, ve], 'cpu_processes': 1, 'environment' : [], }, 'slots' : {'nodes' : [{'name' : node[0], 'uid' : node[1], 'cores' : [[0]], 'gpus' : [] }] }, 'cfg' : {'addr_wrk' : addr_wrk['addr_out'], 'addr_res' : addr_res['addr_in'] } } self._spawn(self._launcher, funcs)
def _create_master_entry (self, url, session, logger) : # FIXME: cache 'which' results, etc # FIXME: check 'which' results with self.rlock : # if True : info = {} info['schema'] = url.schema.lower () info['host_str'] = url.host info['logger'] = logger info['url'] = url info['pass'] = "" info['key_pass'] = {} if not info['schema'] : info['schema'] = 'local' # find out what type of shell we have to deal with if info['schema'] in _SCHEMAS_SSH : info['type'] = "ssh" info['ssh_exe'] = ru.which ("ssh") info['scp_exe'] = ru.which ("scp") info['sftp_exe'] = ru.which ("sftp") elif info['schema'] in _SCHEMAS_GSI : info['type'] = "ssh" info['ssh_exe'] = ru.which ("gsissh") info['scp_exe'] = ru.which ("gsiscp") info['sftp_exe'] = ru.which ("gsisftp") elif info['schema'] in _SCHEMAS_SH : info['type'] = "sh" info['sh_args'] = "-i" info['sh_env'] = "/usr/bin/env TERM=vt100" info['cp_env'] = "/usr/bin/env TERM=vt100" info['fs_root'] = "/" if "SHELL" in os.environ : info['sh_exe'] = ru.which (os.environ["SHELL"]) info['cp_exe'] = ru.which ("cp") else : info['sh_exe'] = ru.which ("sh") info['cp_exe'] = ru.which ("cp") else : raise se.BadParameter._log (self.logger, \ "cannot handle schema '%s://'" % url.schema) # depending on type, create command line (args, env etc) # # We always set term=vt100 to avoid ansi-escape sequences in the prompt # and elsewhere. Also, we have to make sure that the shell is an # interactive login shell, so that it interprets the users startup # files, and reacts on commands. try : info['latency'] = sumisc.get_host_latency (url) # FIXME: note that get_host_latency is considered broken (see # saga/utils/misc.py line 73), and will return a constant 250ms. except Exception as e : info['latency'] = 1.0 # generic value assuming slow link info['logger'].warning ("Could not contact host '%s': %s" % (url, e)) if info['type'] == "sh" : if not sumisc.host_is_local (url.host) : raise se.BadParameter._log (self.logger, \ "expect local host for '%s://', not '%s'" % (url.schema, url.host)) if 'user' in info and info['user'] : pass else : info['user'] = getpass.getuser () else : info['ssh_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['scp_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['sftp_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['ssh_args'] = "-t " # force pty info['scp_args'] = "" info['sftp_args'] = "" if session : for context in session.contexts : # ssh can also handle UserPass contexts, and ssh type contexts. # gsissh can handle the same, but also X509 contexts. 
if context.type.lower () == "ssh" : if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI : if context.attribute_exists ("user_id") and context.user_id : info['user'] = context.user_id if context.attribute_exists ("user_key") and context.user_key : info['ssh_args'] += "-o IdentityFile=%s " % context.user_key info['scp_args'] += "-o IdentityFile=%s " % context.user_key info['sftp_args'] += "-o IdentityFile=%s " % context.user_key if context.attribute_exists ("user_pass") and context.user_pass : info['key_pass'][context.user_key] = context.user_pass if context.type.lower () == "userpass" : if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI : if context.attribute_exists ("user_id") and context.user_id : info['user'] = context.user_id if context.attribute_exists ("user_pass") and context.user_pass : info['pass'] = context.user_pass if context.type.lower () == "x509" : if info['schema'] in _SCHEMAS_GSI : if context.attribute_exists ("user_proxy") and context.user_proxy : info['ssh_env'] += "X509_USER_PROXY='%s' " % context.user_proxy info['scp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy info['sftp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy if context.attribute_exists ("user_cert") and context.user_cert : info['ssh_env'] += "X509_USER_CERT='%s' " % context.user_cert info['scp_env'] += "X509_USER_CERT='%s' " % context.user_cert info['sftp_env'] += "X509_USER_CERT='%s' " % context.user_cert if context.attribute_exists ("user_key") and context.user_key : info['ssh_env'] += "X509_USER_key='%s' " % context.user_key info['scp_env'] += "X509_USER_key='%s' " % context.user_key info['sftp_env'] += "X509_USER_key='%s' " % context.user_key if context.attribute_exists ("cert_repository") and context.cert_repository : info['ssh_env'] += "X509_CERT_DIR='%s' " % context.cert_repository info['scp_env'] += "X509_CERT_DIR='%s' " % context.cert_repository info['sftp_env'] += "X509_CERT_DIR='%s' " % context.cert_repository if url.port and url.port != -1 : info['ssh_args'] += "-p %d " % int(url.port) info['scp_args'] += "-p %d " % int(url.port) info['sftp_args'] += "-P %d " % int(url.port) # all ssh based shells allow for user_id and user_pass from contexts # -- but the data given in the URL take precedence if url.username : info['user'] = url.username if url.password : info['pass'] = url.password ctrl_user = pwd.getpwuid (os.getuid ()).pw_name ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user if 'user' in info and info['user'] : info['host_str'] = "%s@%s" % (info['user'], info['host_str']) info['ctrl'] = "%s_%%h_%%p.%s.%s.ctrl" % (ctrl_base, os.getpid (), info['user']) else : info['user'] = getpass.getuser () info['ctrl'] = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, os.getpid ()) info['m_flags'] = _SSH_FLAGS_MASTER % ({'ctrl' : info['ctrl']}) info['s_flags'] = _SSH_FLAGS_SLAVE % ({'ctrl' : info['ctrl']}) info['fs_root'] = url info['fs_root'].path = "/" # keep all collected info in the master dict, and return it for # registration return info
def _configure(self):

    self.launch_command = ru.which('orterun')
def _which(self, cmd):

    ret = ru.which(cmd)
    if not ret:
        raise RuntimeError('cmd %s not found' % cmd)

    return ret
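# A sketch of how a `_configure` could lean on the `_which` helper above to
# fail fast when a launcher is missing.  The choice of `ibrun` and the
# surrounding class are illustrative assumptions, not actual code.
def _configure(self):

    # raises RuntimeError if `ibrun` is not in the path, instead of
    # silently storing None
    self.launch_command = self._which('ibrun')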
def lrms_config_hook(cls, name, cfg, lrms, logger, profile): import radical.utils as ru if not os.environ.get('SPARK_HOME'): logger.info("Downloading Apache Spark..") try: VERSION = "2.0.2" subprocess.check_call("wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.2-bin-hadoop2.7.tgz".split()) subprocess.check_call('tar -xzf spark-2.0.2-bin-hadoop2.7.tgz'.split()) subprocess.check_call("rm spark-2.0.2-bin-hadoop2.7.tgz ".split()) subprocess.check_call(("mv spark-2.0.2-bin-hadoop2.7 spark-" + VERSION).split()) except Exception as e: raise RuntimeError("Spark wasn't installed properly. Please try again. %s " % e ) spark_home = os.getcwd() + '/spark-' + VERSION else: spark_home = os.environ['SPARK_HOME'] #------------------------------------------------------------------- platform_os = sys.platform java_home = os.environ.get('JAVA_HOME') if platform_os == "linux" or platform_os == "linux2": if not java_home: java = ru.which('java') if java != '/usr/bin/java': jpos=java.split('bin') else: jpos = os.path.realpath('/usr/bin/java').split('bin') if jpos[0].find('jre') != -1: java_home = jpos[0][:jpos[0].find('jre')] else: java_home = jpos[0] else: if not java_home: try: java_home = subprocess.check_output("/usr/libexec/java_home").split()[0] except Exception: java_home = '/Library/Java/Home' spark_conf_slaves = open(spark_home+"/conf/slaves",'w') if len(lrms.node_list) == 1: spark_conf_slaves.write(lrms.node_list[0])#+hostname) spark_conf_slaves.write('\n') else: for nodename in lrms.node_list[1:]: spark_conf_slaves.write(nodename) # +hostname) spark_conf_slaves.write('\n') spark_conf_slaves.close() ## put Master Ip in spark-env.sh file - if len(lrms.node_list) ==1: master_ip = lrms.node_list[0] else: try: master_ip = subprocess.check_output('hostname -f'.split()).strip() except Exception as e: raise RuntimeError("Master ip couldn't be detected. %s" % e) #Setup default env properties: spark_default_file = open(spark_home + "/conf/spark-defaults.conf",'w') spark_master_string = 'spark://%s:7077' % master_ip spark_default_file.write('spark.master ' + spark_master_string + '\n') spark_default_file.close() logger.info("Let's print the config") logger.info('Config : {0}'.format(cfg['resource_cfg'])) spark_env_file = open(spark_home + "/conf/spark-env.sh",'w') #load in the spark enviroment of master and slaves the #configurations of the machine if master_ip!='localhost': for config in cfg['resource_cfg']['pre_bootstrap_0']: spark_env_file.write(config + '\n') spark_env_file.write('export SPARK_MASTER_HOST=' + master_ip + "\n") spark_env_file.write('export JAVA_HOME=' + java_home + "\n") spark_env_file.write('export SPARK_LOG_DIR='+os.getcwd()+'/spark-logs'+'\n') #spark_env_file.write('export PYSPARK_PYTHON=`which python` \n') spark_env_file.close() #### Start spark Cluster try: subprocess.check_output(spark_home + '/sbin/start-all.sh') except Exception as e: raise RuntimeError("Spark Cluster failed to start: %s" % e) logger.info('Start Spark Cluster') launch_command = spark_home +'/bin' # The LRMS instance is only available here -- everything which is later # needed by the scheduler or launch method is stored in an 'lm_info' # dict. That lm_info dict will be attached to the scheduler's lrms_info # dict, and will be passed around as part of the slots structure, # so it is available on all LM create_command calls. 
    lm_info = {'spark_home'    : spark_home,
               'master_ip'     : master_ip,
               'lm_detail'     : spark_master_string,
               'name'          : lrms.name,
               'launch_command': launch_command,
               'nodename'      : lrms.node_list[0]}

    return lm_info
def _configure(self):

    # ibrun: wrapper for mpirun at TACC
    self.launch_command = ru.which('ibrun')
def lrms_config_hook(cls, name, cfg, lrms, logger): """ FIXME: this config hook will inspect the LRMS nodelist and, if needed, will start the YARN cluster on node[0]. """ import radical.utils as ru logger.info('Hook called by YARN LRMS with the name %s'%lrms.name) def config_core_site(node): core_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/core-site.xml','r') lines = core_site_file.readlines() core_site_file.close() prop_str = '<property>\n' prop_str += ' <name>fs.default.name</name>\n' prop_str += ' <value>hdfs://%s:54170</value>\n'%node prop_str += '</property>\n' lines.insert(-1,prop_str) core_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/core-site.xml','w') for line in lines: core_site_file.write(line) core_site_file.close() def config_hdfs_site(nodes): hdfs_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/hdfs-site.xml','r') lines = hdfs_site_file.readlines() hdfs_site_file.close() prop_str = '<property>\n' prop_str += ' <name>dfs.replication</name>\n' prop_str += ' <value>1</value>\n' prop_str += '</property>\n' prop_str += '<property>\n' prop_str += ' <name>dfs.name.dir</name>\n' prop_str += ' <value>file:///tmp/hadoop/hadoopdata/hdfs/namenode</value>\n' prop_str += '</property>\n' prop_str += '<property>\n' prop_str += ' <name>dfs.data.dir</name>\n' prop_str += ' <value>file:///tmp/hadoop/hadoopdata/hdfs/datanode</value>\n' prop_str += '</property>\n' lines.insert(-1,prop_str) hdfs_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/hdfs-site.xml','w') for line in lines: hdfs_site_file.write(line) hdfs_site_file.close() def config_mapred_site(): mapred_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/mapred-site.xml.template','r') lines = mapred_site_file.readlines() mapred_site_file.close() prop_str = ' <property>\n' prop_str += ' <name>mapreduce.framework.name</name>\n' prop_str += ' <value>yarn</value>\n' prop_str += ' </property>\n' lines.insert(-1,prop_str) mapred_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/mapred-site.xml','w') for line in lines: mapred_site_file.write(line) mapred_site_file.close() def config_yarn_site(cores,nodelist,hostname): yarn_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/yarn-site.xml','r') lines = yarn_site_file.readlines() yarn_site_file.close() total_mem_str=subprocess.check_output(['grep','MemTotal','/proc/meminfo']) total_free_mem=int(total_mem_str.split()[1])/1048 if nodelist.__len__() == 1: cores_used = cores/2 total_mem = total_free_mem*0.75 else: cores_used = cores*(len(nodelist)-1) total_mem = total_free_mem*(len(nodelist)-1) slaves = open(os.getcwd()+'/hadoop/etc/hadoop/slaves','w') for node in nodelist[1:]: slaves.write('%s\n'%(node+hostname)) slaves.close() master = open(os.getcwd()+'/hadoop/etc/hadoop/masters','w') master.write('%s\n'%(nodelist[0]+hostname)) master.close() max_app_mem = total_mem/cores_used prop_str = ' <property>\n' prop_str += ' <name>yarn.nodemanager.aux-services</name>\n' prop_str += ' <value>mapreduce_shuffle</value>\n' prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.scheduler.maximum-allocation-mb</name>\n' prop_str += ' <value>%d</value>\n'%max_app_mem prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.resourcemanager.hostname</name>\n' prop_str += ' <value>%s</value>\n'%(nodelist[0]+hostname) prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.nodemanager.resource.cpu-vcores</name>\n' prop_str += ' <value>%d</value>\n'%cores_used prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' 
<name>yarn.nodemanager.resource.memory-mb</name>\n' prop_str += ' <value>%d</value>\n'%total_mem prop_str += ' </property>\n' lines.insert(-1,prop_str) yarn_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/yarn-site.xml','w') for line in lines: yarn_site_file.write(line) yarn_site_file.close() scheduler_file=open(os.getcwd()+'/hadoop/etc/hadoop/capacity-scheduler.xml','r') lines=scheduler_file.readlines() scheduler_file.close() for line in lines: if line.startswith(' <value>org.apache.hadoop.yarn.util.resource.'): new_line=' <value>org.apache.hadoop.yarn.util.resource.'+'DefaultResourceCalculator</value>\n' lines[lines.index(line)]=new_line elif line.startswith(' <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>'): new_line=' <value>1</value>\n' lines[lines.index(line)+1]=new_line scheduler_file=open(os.getcwd()+'/hadoop/etc/hadoop/capacity-scheduler.xml','w') for line in lines: scheduler_file.write(line) scheduler_file.close() # If the LRMS used is not YARN the namenode url is going to be # the first node in the list and the port is the default one, else # it is the one that the YARN LRMS returns hadoop_home = None if lrms.name == 'YARNLRMS': # FIXME: use constant logger.info('Hook called by YARN LRMS') logger.info('NameNode: {0}'.format(lrms.namenode_url)) service_url = lrms.namenode_url rm_url = "%s:%s" % (lrms.rm_ip, lrms.rm_port) rm_ip = lrms.rm_ip launch_command = ru.which('yarn') else: # Here are the necessary commands to start the cluster. if lrms.node_list[0] == 'localhost': #Download the tar file node_name = lrms.node_list[0] stat = os.system("wget http://apache.claz.org/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz") stat = os.system('tar xzf hadoop-2.6.0.tar.gz;mv hadoop-2.6.0 hadoop;rm -rf hadoop-2.6.0.tar.gz') else: node = subprocess.check_output('/bin/hostname') logger.info('Entered Else creation') node_name = node.split('\n')[0] stat = os.system("wget http://apache.claz.org/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz") stat = os.system('tar xzf hadoop-2.6.0.tar.gz;mv hadoop-2.6.0 hadoop;rm -rf hadoop-2.6.0.tar.gz') # TODO: Decide how the agent will get Hadoop tar ball. hadoop_home = os.getcwd() + '/hadoop' hadoop_install = hadoop_home hadoop_mapred_home = hadoop_home hadoop_common_home = hadoop_home hadoop_hdfs_home = hadoop_home yarn_home = hadoop_home hadoop_common_lib_native_dir = hadoop_home + '/lib/native' #------------------------------------------------------------------- # Solution to find Java's home folder: # http://stackoverflow.com/questions/1117398/java-home-directory java = ru.which('java') if java != '/usr/bin/java': jpos=java.split('bin') else: jpos = os.path.realpath('/usr/bin/java').split('bin') if jpos[0].find('jre') != -1: java_home = jpos[0][:jpos[0].find('jre')] else: java_home = jpos[0] hadoop_env_file = open(hadoop_home+'/etc/hadoop/hadoop-env.sh','r') hadoop_env_file_lines = hadoop_env_file.readlines() hadoop_env_file.close() hadoop_env_file_lines[24] = 'export JAVA_HOME=%s'%java_home hadoop_env_file = open(hadoop_home+'/etc/hadoop/hadoop-env.sh','w') for line in hadoop_env_file_lines: hadoop_env_file.write(line) hadoop_env_file.close() host=node_name.split(lrms.node_list[0])[1] config_core_site(node_name) config_hdfs_site(lrms.node_list) config_mapred_site() config_yarn_site(lrms.cores_per_node,lrms.node_list,host) logger.info('Start Formatting DFS') namenode_format = os.system(hadoop_home + '/bin/hdfs namenode -format -force') logger.info('DFS Formatted. 
Starting DFS.') logger.info('Starting YARN') yarn_start = subprocess.check_output([hadoop_home + '/sbin/start-all.sh']) if 'Error' in yarn_start: raise RuntimeError('Unable to start YARN cluster: %s' \ % (yarn_start)) else: logger.info('Started YARN') #------------------------------------------------------------------- # Creating user's HDFS home folder logger.debug('Running: %s/bin/hdfs dfs -mkdir /user'%hadoop_home) os.system('%s/bin/hdfs dfs -mkdir /user'%hadoop_home) uname = subprocess.check_output('whoami').split('\n')[0] logger.debug('Running: %s/bin/hdfs dfs -mkdir /user/%s'%(hadoop_home,uname)) os.system('%s/bin/hdfs dfs -mkdir /user/%s'%(hadoop_home,uname)) check = subprocess.check_output(['%s/bin/hdfs'%hadoop_home,'dfs', '-ls', '/user']) logger.info(check) # FIXME YARN: why was the scheduler configure called here? Configure # is already called during scheduler instantiation # self._scheduler._configure() service_url = node_name + ':54170' rm_url = node_name launch_command = yarn_home + '/bin/yarn' rm_ip = node_name # The LRMS instance is only available here -- everything which is later # needed by the scheduler or launch method is stored in an 'lm_info' # dict. That lm_info dict will be attached to the scheduler's lrms_info # dict, and will be passed around as part of the opaque_slots structure, # so it is available on all LM create_command calls. lm_info = {'service_url' : service_url, 'rm_url' : rm_url, 'hadoop_home' : hadoop_home, 'rm_ip' : rm_ip, 'name' : lrms.name, 'launch_command': launch_command, 'nodename' : lrms.node_list[0] } return lm_info
def lrms_config_hook(cls, name, cfg, lrms, logger, profiler): """ FIXME: this config hook will manipulate the LRMS nodelist. Not a nice thing to do, but hey... :P What really should be happening is that the LRMS digs information on node reservation out of the config and configures the node list accordingly. This config hook should be limited to starting the DVM. """ dvm_command = ru.which('orte-dvm') if not dvm_command: raise Exception("Couldn't find orte-dvm") # Now that we found the orte-dvm, get ORTE version out, err, ret = ru.sh_callout('orte-info | grep "Open RTE"', shell=True) orte_info = dict() for line in out.split('\n'): line = line.strip() if not line: continue key, val = line.split(':', 1) if 'Open RTE' == key.strip(): orte_info['version'] = val.strip() elif 'Open RTE repo revision' == key.strip(): orte_info['version_detail'] = val.strip() assert (orte_info.get('version')) logger.info("Found Open RTE: %s / %s", orte_info['version'], orte_info.get('version_detail')) # Use (g)stdbuf to disable buffering. # We need this to get the "DVM ready", # without waiting for orte-dvm to complete. # The command seems to be generally available on our Cray's, # if not, we can code some home-coooked pty stuff. stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf']) if not stdbuf_cmd: raise Exception("Couldn't find (g)stdbuf") stdbuf_arg = "-oL" # Base command = (g)stdbuf <args> + orte-dvm + debug_args dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command] # Additional (debug) arguments to orte-dvm if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'): debug_strings = [ '--debug-devel', '--mca odls_base_verbose 100', '--mca rml_base_verbose 100' ] else: debug_strings = [] # Split up the debug strings into args and add them to the dvm_args [dvm_args.extend(ds.split()) for ds in debug_strings] vm_size = len(lrms.node_list) logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args)) profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id']) dvm_uri = None dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT) while True: line = dvm_process.stdout.readline().strip() if line.startswith('VMURI:'): if len(line.split(' ')) != 2: raise Exception("Unknown VMURI format: %s" % line) label, dvm_uri = line.split(' ', 1) if label != 'VMURI:': raise Exception("Unknown VMURI format: %s" % line) logger.info("ORTE DVM URI: %s" % dvm_uri) elif line == 'DVM ready': if not dvm_uri: raise Exception("VMURI not found!") logger.info("ORTE DVM startup successful!") profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id']) break else: # Check if the process is still around, # and log output in debug mode. if dvm_process.poll() is None: logger.debug("ORTE: %s", line) else: # Process is gone: fatal! raise Exception("ORTE DVM process disappeared") profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id']) # ---------------------------------------------------------------------- def _watch_dvm(): logger.info('starting DVM watcher') retval = dvm_process.poll() while retval is None: line = dvm_process.stdout.readline().strip() if line: logger.debug('dvm output: %s', line) else: time.sleep(1.0) if retval != 0: # send a kill signal to the main thread. # We know that Python and threading are likely not to play well # with signals - but this is an exceptional case, and not part # of the stadard termination sequence. If the signal is # swallowed, the next `orte-submit` call will trigger # termination anyway. 
            # `os.kill` requires a signal argument, which the original call
            # omitted; SIGTERM matches the intent of the surrounding comment
            # (assumes `signal` is imported at module scope)
            os.kill(os.getpid(), signal.SIGTERM)

        logger.info('DVM stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
    dvm_watcher.start()

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': {name: orte_info}}

    # we need to inform the actual LM instance about the DVM URI.  So we
    # pass it back to the LRMS which will keep it in an 'lm_info', which
    # will then be passed as part of the slots via the scheduler
    return lm_info
def _configure(self):

    # poe: LSF specific wrapper for MPI (e.g. yellowstone)
    self.launch_command = ru.which('poe')
def _configure(self):

    # dplace: job launcher for SGI systems (e.g. on Blacklight)
    self.launch_command = ru.which('dplace')
def _configure(self):

    # ccmrun: Cluster Compatibility Mode (CCM) job launcher for Cray systems
    self.launch_command = ru.which('ccmrun')
def _configure(self):

    # ensure that `prun` is in the path (`which` will raise otherwise)
    ru.which('prun')
    self.launch_command = 'prun'
def _create_master_entry (self, url, session, prompt, logger, posix) : # FIXME: cache 'which' results, etc # FIXME: check 'which' results with self.rlock : info = {'posix' : posix} # get and evaluate session config if not session : session = saga.Session (default=True) session_cfg = session.get_config ('saga.utils.pty') info['ssh_copy_mode'] = session_cfg['ssh_copy_mode'].get_value () info['ssh_share_mode'] = session_cfg['ssh_share_mode'].get_value () logger.info ("ssh copy mode set to '%s'" % info['ssh_copy_mode' ]) logger.info ("ssh share mode set to '%s'" % info['ssh_share_mode']) # fill the info dict with details for this master channel, and all # related future slave channels info['schema'] = url.schema.lower () info['host_str'] = url.host info['prompt'] = prompt info['logger'] = logger info['url'] = url info['pass'] = "" info['key_pass'] = {} info['scripts'] = _SCRIPTS if not info['schema'] : info['schema'] = 'local' # find out what type of shell we have to deal with if info['schema'] in _SCHEMAS_SSH : info['shell_type'] = "ssh" info['copy_mode'] = info['ssh_copy_mode'] info['share_mode'] = info['ssh_share_mode'] info['ssh_exe'] = ru.which ("ssh") info['scp_exe'] = ru.which ("scp") info['sftp_exe'] = ru.which ("sftp") elif info['schema'] in _SCHEMAS_GSI : info['shell_type'] = "ssh" info['copy_mode'] = info['ssh_copy_mode'] info['share_mode'] = info['ssh_share_mode'] info['ssh_exe'] = ru.which ("gsissh") info['scp_exe'] = ru.which ("gsiscp") info['sftp_exe'] = ru.which ("gsisftp") elif info['schema'] in _SCHEMAS_SH : info['shell_type'] = "sh" info['copy_mode'] = "sh" info['share_mode'] = "auto" info['sh_args'] = "-i" info['sh_env'] = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'" info['cp_env'] = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'" info['scp_root'] = "/" if "SHELL" in os.environ : info['sh_exe'] = ru.which (os.environ["SHELL"]) info['cp_exe'] = ru.which ("cp") else : info['sh_exe'] = ru.which ("sh") info['cp_exe'] = ru.which ("cp") else : raise se.BadParameter._log (self.logger, \ "cannot handle schema '%s://'" % url.schema) # depending on type, create command line (args, env etc) # # We always set term=vt100 to avoid ansi-escape sequences in the prompt # and elsewhere. Also, we have to make sure that the shell is an # interactive login shell, so that it interprets the users startup # files, and reacts on commands. try : info['latency'] = sumisc.get_host_latency (url) # FIXME: note that get_host_latency is considered broken (see # saga/utils/misc.py line 73), and will return a constant 250ms. except Exception as e : info['latency'] = 1.0 # generic value assuming slow link info['logger'].warning ("Could not contact host '%s': %s" % (url, e)) if info['shell_type'] == "sh" : info['sh_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes if not sumisc.host_is_local (url.host) : raise se.BadParameter._log (self.logger, \ "expect local host for '%s://', not '%s'" % (url.schema, url.host)) if 'user' in info and info['user'] : pass else : info['user'] = getpass.getuser () else : info['ssh_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['scp_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['sftp_env'] = "/usr/bin/env TERM=vt100 " # avoid ansi escapes info['ssh_args'] = "-t " # force pty info['scp_args'] = _SCP_FLAGS info['sftp_args'] = _SFTP_FLAGS if session : for context in session.contexts : # ssh can also handle UserPass contexts, and ssh type contexts. # gsissh can handle the same, but also X509 contexts. 
if context.type.lower () == "ssh" : if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI : if context.attribute_exists ("user_id") and context.user_id : info['user'] = context.user_id if context.attribute_exists ("user_key") and context.user_key : info['ssh_args'] += "-o IdentityFile=%s " % context.user_key info['scp_args'] += "-o IdentityFile=%s " % context.user_key info['sftp_args'] += "-o IdentityFile=%s " % context.user_key if context.attribute_exists ("user_pass") and context.user_pass : info['key_pass'][context.user_key] = context.user_pass if context.type.lower () == "userpass" : if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI : if context.attribute_exists ("user_id") and context.user_id : info['user'] = context.user_id if context.attribute_exists ("user_pass") and context.user_pass : info['pass'] = context.user_pass if context.type.lower () == "x509" : if info['schema'] in _SCHEMAS_GSI : if context.attribute_exists ("user_proxy") and context.user_proxy : info['ssh_env'] += "X509_USER_PROXY='%s' " % context.user_proxy info['scp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy info['sftp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy if context.attribute_exists ("user_cert") and context.user_cert : info['ssh_env'] += "X509_USER_CERT='%s' " % context.user_cert info['scp_env'] += "X509_USER_CERT='%s' " % context.user_cert info['sftp_env'] += "X509_USER_CERT='%s' " % context.user_cert if context.attribute_exists ("user_key") and context.user_key : info['ssh_env'] += "X509_USER_key='%s' " % context.user_key info['scp_env'] += "X509_USER_key='%s' " % context.user_key info['sftp_env'] += "X509_USER_key='%s' " % context.user_key if context.attribute_exists ("cert_repository") and context.cert_repository : info['ssh_env'] += "X509_CERT_DIR='%s' " % context.cert_repository info['scp_env'] += "X509_CERT_DIR='%s' " % context.cert_repository info['sftp_env'] += "X509_CERT_DIR='%s' " % context.cert_repository if url.port and url.port != -1 : info['ssh_args'] += "-p %d " % int(url.port) info['scp_args'] += "-p %d " % int(url.port) info['sftp_args'] += "-P %d " % int(url.port) # all ssh based shells allow for user_id and user_pass from contexts # -- but the data given in the URL take precedence if url.username : info['user'] = url.username if url.password : info['pass'] = url.password ctrl_user = pwd.getpwuid (os.getuid ()).pw_name ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user if 'user' in info and info['user'] : info['host_str'] = "%s@%s" % (info['user'], info['host_str']) info['ctrl'] = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, info['user']) else : info['user'] = getpass.getuser () info['ctrl'] = "%s_%%h_%%p.ctrl" % (ctrl_base) info['m_flags'] = _SSH_FLAGS_MASTER % ({'share_mode' : info['share_mode'], 'ctrl' : info['ctrl']}) info['s_flags'] = _SSH_FLAGS_SLAVE % ({'share_mode' : info['share_mode'], 'ctrl' : info['ctrl']}) # we want the userauth and hostname parts of the URL, to get the # scp-scope fs root. info['scp_root'] = "" has_auth = False if url.username : info['scp_root'] += url.username has_auth = True if url.password : info['scp_root'] += ":" info['scp_root'] += url.password has_auth = True if has_auth : info['scp_root'] += "@" info['scp_root'] += "%s:" % url.host # FIXME: port needs to be handled as parameter # if url.port : # info['scp_root'] += ":%d" % url.port # keep all collected info in the master dict, and return it for # registration return info
def rm_config_hook(cls, name, cfg, rm, log, profiler): prte = ru.which('prte') if not prte: raise Exception("Couldn't find prte") # Now that we found the prte, get PRUN version out, _, _ = ru.sh_callout('prte_info | grep "Open RTE"', shell=True) prte_info = dict() for line in out.split('\n'): line = line.strip() if 'Open RTE:' in line: prte_info['version'] = line.split(':')[1].strip() elif 'Open RTE repo revision:' in line: prte_info['version_detail'] = line.split(':')[1].strip() log.info("Found Open RTE: %s [%s]", prte_info.get('version'), prte_info.get('version_detail')) # write hosts file furi = '%s/prrte.uri' % os.getcwd() fhosts = '%s/prrte.hosts' % os.getcwd() vm_size = len(rm.node_list) with open(fhosts, 'w') as fout: for node in rm.node_list: fout.write('%s slots=%d\n' % (node[0], rm.cores_per_node * rm.smt)) pre = os.environ['PRRTE_PREFIX'] prte += ' --prefix %s' % pre prte += ' --report-uri %s' % furi prte += ' --hostfile %s' % fhosts if profiler.enabled: prte += ' --pmca orte_state_base_verbose 1' # prte profiling # large tasks imply large message sizes, and we need to account for that # FIXME: we should derive the message size from DVM size - smaller DVMs # will never need large messages, as they can't run large tasks) prte += ' --pmca ptl_base_max_msg_size %d' % (1024 * 1024 * 1024 * 1) # prte += ' --pmca rmaps_base_verbose 5' # debug mapper problems for large tasks if log.isEnabledFor(logging.DEBUG): prte += ' -pmca orte_rmaps_base_verbose 100' # we apply two temporary tweaks on Summit which should not be needed in # the long run: # # avoid 64 node limit (ssh connection limit) prte += ' --pmca plm_rsh_no_tree_spawn 1' # ensure 1 ssh per dvm prte += ' --pmca plm_rsh_num_concurrent %d' % vm_size # Use (g)stdbuf to disable buffering. We need this to get the # "DVM ready" message to ensure DVM startup completion # # The command seems to be generally available on our Cray's, # if not, we can code some home-coooked pty stuff (TODO) stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf']) if not stdbuf_cmd: raise Exception("Couldn't find (g)stdbuf") stdbuf_arg = "-oL" # Base command = (g)stdbuf <args> + prte + prte-args + debug_args cmdline = '%s %s %s ' % (stdbuf_cmd, stdbuf_arg, prte) # cmdline = prte # Additional (debug) arguments to prte verbose = bool(os.environ.get('RADICAL_PILOT_PRUN_VERBOSE')) if verbose: debug_strings = [ '--debug-devel', '--pmca odls_base_verbose 100', '--pmca rml_base_verbose 100', ] else: debug_strings = [] # Split up the debug strings into args and add them to the cmdline cmdline += ' '.join(debug_strings) cmdline = cmdline.strip() log.info("Start prte on %d nodes [%s]", vm_size, cmdline) profiler.prof(event='dvm_start', uid=cfg['pid']) dvm_uri = None dvm_process = mp.Popen(cmdline.split(), stdout=mp.PIPE, stderr=mp.STDOUT) # ---------------------------------------------------------------------- def _watch_dvm(): log.info('starting prte watcher') retval = dvm_process.poll() while retval is None: line = dvm_process.stdout.readline().strip() if line: log.debug('prte output: %s', line) else: time.sleep(1.0) if retval != 0: # send a kill signal to the main thread. # We know that Python and threading are likely not to play well # with signals - but this is an exceptional case, and not part # of the stadard termination sequence. If the signal is # swallowed, the next `prun` call will trigger # termination anyway. 
            # `os.kill` requires a signal argument, which the original call
            # omitted; SIGTERM matches the intent of the surrounding comment
            # (assumes `signal` is imported at module scope)
            os.kill(os.getpid(), signal.SIGTERM)
            raise RuntimeError('PRTE DVM died')

        log.info('prte stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = mt.Thread(target=_watch_dvm)
    dvm_watcher.daemon = True
    dvm_watcher.start()

    for _ in range(100):

        time.sleep(0.5)

        try:
            with open(furi, 'r') as fin:
                for line in fin.readlines():
                    if '://' in line:
                        dvm_uri = line.strip()
                        break

        except Exception as e:
            log.debug('DVM check: uri file missing: %s...' % str(e)[:24])
            time.sleep(0.5)

        if dvm_uri:
            break

    if not dvm_uri:
        raise Exception("VMURI not found!")

    log.info("prte startup successful: [%s]", dvm_uri)

    # in some cases, the DVM seems to need some additional time to settle.
    # FIXME: this should not be needed, really
    time.sleep(10)

    profiler.prof(event='dvm_ok', uid=cfg['pid'])

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': prte_info,
               'cvd_id_mode' : 'physical'}

    # we need to inform the actual LaunchMethod instance about the prte URI.
    # So we pass it back to the ResourceManager which will keep it in an
    # 'lm_info', which will then be passed as part of the slots via the
    # scheduler
    return lm_info
def _configure(self):

    self.launch_command = ru.which('jsrun')
    assert(self.launch_command)
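# Note that a bare `assert` is skipped when Python runs with `-O`; the other
# launch methods in this section raise explicitly instead.  A sketch of the
# more defensive variant with otherwise identical behavior:
def _configure(self):

    self.launch_command = ru.which('jsrun')
    if not self.launch_command:
        raise RuntimeError("jsrun not found!")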
def lrms_config_hook(cls, name, cfg, lrms, logger, profiler): """ FIXME: this config hook will manipulate the LRMS nodelist. Not a nice thing to do, but hey... :P What really should be happening is that the LRMS digs information on node reservation out of the config and configures the node list accordingly. This config hook should be limited to starting the DVM. """ dvm_command = ru.which('orte-dvm') if not dvm_command: raise Exception("Couldn't find orte-dvm") # Now that we found the orte-dvm, get ORTE version out, err, ret = ru.sh_callout('orte-info | grep "Open RTE"', shell=True) orte_info = dict() for line in out.split('\n'): line = line.strip() if not line: continue key, val = line.split(':', 1) if 'Open RTE' == key.strip(): orte_info['version'] = val.strip() elif 'Open RTE repo revision' == key.strip(): orte_info['version_detail'] = val.strip() assert(orte_info.get('version')) logger.info("Found Open RTE: %s / %s", orte_info['version'], orte_info.get('version_detail')) # Use (g)stdbuf to disable buffering. # We need this to get the "DVM ready", # without waiting for orte-dvm to complete. # The command seems to be generally available on our Cray's, # if not, we can code some home-coooked pty stuff. stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf']) if not stdbuf_cmd: raise Exception("Couldn't find (g)stdbuf") stdbuf_arg = "-oL" # Base command = (g)stdbuf <args> + orte-dvm + debug_args dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command] # Additional (debug) arguments to orte-dvm if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'): debug_strings = [ '--debug-devel', '--mca odls_base_verbose 100', '--mca rml_base_verbose 100' ] else: debug_strings = [] # Split up the debug strings into args and add them to the dvm_args [dvm_args.extend(ds.split()) for ds in debug_strings] vm_size = len(lrms.node_list) logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args)) profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id']) dvm_uri = None dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT) while True: line = dvm_process.stdout.readline().strip() if line.startswith('VMURI:'): if len(line.split(' ')) != 2: raise Exception("Unknown VMURI format: %s" % line) label, dvm_uri = line.split(' ', 1) if label != 'VMURI:': raise Exception("Unknown VMURI format: %s" % line) logger.info("ORTE DVM URI: %s" % dvm_uri) elif line == 'DVM ready': if not dvm_uri: raise Exception("VMURI not found!") logger.info("ORTE DVM startup successful!") profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id']) break else: # Check if the process is still around, # and log output in debug mode. if dvm_process.poll() is None: logger.debug("ORTE: %s", line) else: # Process is gone: fatal! raise Exception("ORTE DVM process disappeared") profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id']) # ---------------------------------------------------------------------- def _watch_dvm(): logger.info('starting DVM watcher') retval = dvm_process.poll() while retval is None: line = dvm_process.stdout.readline().strip() if line: logger.debug('dvm output: %s', line) else: time.sleep(1.0) if retval != 0: # send a kill signal to the main thread. # We know that Python and threading are likely not to play well # with signals - but this is an exceptional case, and not part # of the stadard termination sequence. If the signal is # swallowed, the next `orte-submit` call will trigger # termination anyway. 
            # `os.kill` requires a signal argument, which the original call
            # omitted; SIGTERM matches the intent of the surrounding comment
            # (assumes `signal` is imported at module scope)
            os.kill(os.getpid(), signal.SIGTERM)

        logger.info('DVM stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
    dvm_watcher.start()

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': {name: orte_info}}

    # we need to inform the actual LM instance about the DVM URI.  So we
    # pass it back to the LRMS which will keep it in an 'lm_info', which
    # will then be passed as part of the slots via the scheduler
    return lm_info
def _configure(self):

    # Find rsh command
    self.launch_command = ru.which('rsh')
def _configure(self):

    self.launch_command = ru.which('orterun')

    # Request to create a background asynchronous event loop
    os.putenv("OMPI_MCA_ess_tool_async_progress", "enabled")
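# One caveat for the snippet above: `os.putenv` changes the environment seen
# by child processes but does not update `os.environ`, so later code that
# inspects `os.environ` will not see the setting.  Assigning through
# `os.environ` (which calls `putenv` under the hood) avoids that; the line
# below is a sketch of that alternative, not a change to the original code.
import os

os.environ["OMPI_MCA_ess_tool_async_progress"] = "enabled"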
def lrms_config_hook(cls, name, cfg, lrms, logger, profile): """ FIXME: this config hook will inspect the LRMS nodelist and, if needed, will start the YARN cluster on node[0]. """ logger.info('Hook called by YARN LRMS with the name %s' % lrms.name) def config_core_site(node): core_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/core-site.xml', 'r') lines = core_site_file.readlines() core_site_file.close() prop_str = '<property>\n' prop_str += ' <name>fs.default.name</name>\n' prop_str += ' <value>hdfs://%s:54170</value>\n' % node prop_str += '</property>\n' lines.insert(-1, prop_str) core_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/core-site.xml', 'w') for line in lines: core_site_file.write(line) core_site_file.close() def config_hdfs_site(nodes): hdfs_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/hdfs-site.xml', 'r') lines = hdfs_site_file.readlines() hdfs_site_file.close() prop_str = '<property>\n' prop_str += ' <name>dfs.replication</name>\n' prop_str += ' <value>1</value>\n' prop_str += '</property>\n' prop_str += '<property>\n' prop_str += ' <name>dfs.name.dir</name>\n' prop_str += ' <value>file:///tmp/hadoop/hadoopdata/hdfs/namenode</value>\n' prop_str += '</property>\n' prop_str += '<property>\n' prop_str += ' <name>dfs.data.dir</name>\n' prop_str += ' <value>file:///tmp/hadoop/hadoopdata/hdfs/datanode</value>\n' prop_str += '</property>\n' lines.insert(-1, prop_str) hdfs_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/hdfs-site.xml', 'w') for line in lines: hdfs_site_file.write(line) hdfs_site_file.close() def config_mapred_site(): mapred_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/mapred-site.xml.template', 'r') lines = mapred_site_file.readlines() mapred_site_file.close() prop_str = ' <property>\n' prop_str += ' <name>mapreduce.framework.name</name>\n' prop_str += ' <value>yarn</value>\n' prop_str += ' </property>\n' lines.insert(-1, prop_str) mapred_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/mapred-site.xml', 'w') for line in lines: mapred_site_file.write(line) mapred_site_file.close() def config_yarn_site(cores, nodelist, hostname): yarn_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/yarn-site.xml', 'r') lines = yarn_site_file.readlines() yarn_site_file.close() total_mem_str = subprocess.check_output( ['grep', 'MemTotal', '/proc/meminfo']) total_free_mem = int(total_mem_str.split()[1]) / 1048 if nodelist.__len__() == 1: cores_used = cores / 2 total_mem = total_free_mem * 0.75 else: cores_used = cores * (len(nodelist) - 1) total_mem = total_free_mem * (len(nodelist) - 1) slaves = open(os.getcwd() + '/hadoop/etc/hadoop/slaves', 'w') for node in nodelist[1:]: slaves.write('%s\n' % (node + hostname)) slaves.close() master = open(os.getcwd() + '/hadoop/etc/hadoop/masters', 'w') master.write('%s\n' % (nodelist[0] + hostname)) master.close() max_app_mem = total_mem / cores_used prop_str = ' <property>\n' prop_str += ' <name>yarn.nodemanager.aux-services</name>\n' prop_str += ' <value>mapreduce_shuffle</value>\n' prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.scheduler.maximum-allocation-mb</name>\n' prop_str += ' <value>%d</value>\n' % max_app_mem prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.resourcemanager.hostname</name>\n' prop_str += ' <value>%s</value>\n' % (nodelist[0] + hostname) prop_str += ' </property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.nodemanager.resource.cpu-vcores</name>\n' prop_str += ' <value>%d</value>\n' % cores_used prop_str += ' 
</property>\n' prop_str += ' <property>\n' prop_str += ' <name>yarn.nodemanager.resource.memory-mb</name>\n' prop_str += ' <value>%d</value>\n' % total_mem prop_str += ' </property>\n' lines.insert(-1, prop_str) yarn_site_file = open( os.getcwd() + '/hadoop/etc/hadoop/yarn-site.xml', 'w') for line in lines: yarn_site_file.write(line) yarn_site_file.close() scheduler_file = open( os.getcwd() + '/hadoop/etc/hadoop/capacity-scheduler.xml', 'r') lines = scheduler_file.readlines() scheduler_file.close() for line in lines: if line.startswith( ' <value>org.apache.hadoop.yarn.util.resource.'): new_line = ' <value>org.apache.hadoop.yarn.util.resource.' + 'DefaultResourceCalculator</value>\n' lines[lines.index(line)] = new_line elif line.startswith( ' <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>' ): new_line = ' <value>1</value>\n' lines[lines.index(line) + 1] = new_line scheduler_file = open( os.getcwd() + '/hadoop/etc/hadoop/capacity-scheduler.xml', 'w') for line in lines: scheduler_file.write(line) scheduler_file.close() # If the LRMS used is not YARN the namenode url is going to be # the first node in the list and the port is the default one, else # it is the one that the YARN LRMS returns hadoop_home = None if lrms.name == 'YARNLRMS': # FIXME: use constant logger.info('Hook called by YARN LRMS') logger.info('NameNode: %s', lrms.namenode_url) service_url = lrms.namenode_url rm_url = "%s:%s" % (lrms.rm_ip, lrms.rm_port) rm_ip = lrms.rm_ip launch_command = ru.which('yarn') else: # Here are the necessary commands to start the cluster. if lrms.node_list[0] == 'localhost': #Download the tar file node_name = lrms.node_list[0] stat = os.system( "wget http://apache.claz.org/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz" ) stat = os.system( 'tar xzf hadoop-2.6.5.tar.gz;mv hadoop-2.6.5 hadoop;rm -rf hadoop-2.6.5.tar.gz' ) else: node = subprocess.check_output('/bin/hostname') logger.info('Entered Else creation') node_name = node.split('\n')[0] stat = os.system( "wget http://apache.claz.org/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz" ) stat = os.system( 'tar xzf hadoop-2.6.5.tar.gz;mv hadoop-2.6.5 hadoop;rm -rf hadoop-2.6.5.tar.gz' ) # TODO: Decide how the agent will get Hadoop tar ball. 
hadoop_home = os.getcwd() + '/hadoop' hadoop_install = hadoop_home hadoop_mapred_home = hadoop_home hadoop_common_home = hadoop_home hadoop_hdfs_home = hadoop_home yarn_home = hadoop_home hadoop_common_lib_native_dir = hadoop_home + '/lib/native' #------------------------------------------------------------------- # Solution to find Java's home folder: # http://stackoverflow.com/questions/1117398/java-home-directory java = ru.which('java') if java != '/usr/bin/java': jpos = java.split('bin') else: jpos = os.path.realpath('/usr/bin/java').split('bin') if jpos[0].find('jre') != -1: java_home = jpos[0][:jpos[0].find('jre')] else: java_home = jpos[0] hadoop_env_file = open(hadoop_home + '/etc/hadoop/hadoop-env.sh', 'r') hadoop_env_file_lines = hadoop_env_file.readlines() hadoop_env_file.close() hadoop_env_file_lines[24] = 'export JAVA_HOME=%s' % java_home hadoop_env_file = open(hadoop_home + '/etc/hadoop/hadoop-env.sh', 'w') for line in hadoop_env_file_lines: hadoop_env_file.write(line) hadoop_env_file.close() host = node_name.split(lrms.node_list[0])[1] config_core_site(node_name) config_hdfs_site(lrms.node_list) config_mapred_site() config_yarn_site(lrms.cores_per_node, lrms.node_list, host) logger.info('Start Formatting DFS') namenode_format = os.system(hadoop_home + '/bin/hdfs namenode -format -force') logger.info('DFS Formatted. Starting DFS.') logger.info('Starting YARN') yarn_start = subprocess.check_output( [hadoop_home + '/sbin/start-all.sh']) if 'Error' in yarn_start: raise RuntimeError('Unable to start YARN cluster: %s' \ % (yarn_start)) else: logger.info('Started YARN') #------------------------------------------------------------------- # Creating user's HDFS home folder logger.debug('Running: %s/bin/hdfs dfs -mkdir /user' % hadoop_home) os.system('%s/bin/hdfs dfs -mkdir /user' % hadoop_home) uname = subprocess.check_output('whoami').split('\n')[0] logger.debug('Running: %s/bin/hdfs dfs -mkdir /user/%s' % (hadoop_home, uname)) os.system('%s/bin/hdfs dfs -mkdir /user/%s' % (hadoop_home, uname)) check = subprocess.check_output( ['%s/bin/hdfs' % hadoop_home, 'dfs', '-ls', '/user']) logger.info(check) logger.info('Getting YARN app') os.system( 'wget https://www.dropbox.com/s/9yxbj9btibgtg40/Pilot-YARN-0.1-jar-with-dependencies.jar' ) # FIXME YARN: why was the scheduler configure called here? Configure # is already called during scheduler instantiation # self._scheduler._configure() service_url = node_name + ':54170' rm_url = node_name launch_command = yarn_home + '/bin/yarn' rm_ip = node_name # The LRMS instance is only available here -- everything which is later # needed by the scheduler or launch method is stored in an 'lm_info' # dict. That lm_info dict will be attached to the scheduler's lrms_info # dict, and will be passed around as part of the opaque_slots structure, # so it is available on all LM create_command calls. lm_info = { 'service_url': service_url, 'rm_url': rm_url, 'hadoop_home': hadoop_home, 'rm_ip': rm_ip, 'name': lrms.name, 'launch_command': launch_command, 'nodename': lrms.node_list[0] } return lm_info
def _create_master_entry(self, url, session, prompt, logger):
    # FIXME: cache 'which' results, etc
    # FIXME: check 'which' results

    with self.rlock:

        info = {}

        info['schema']   = url.schema.lower()
        info['host_str'] = url.host
        info['prompt']   = prompt
        info['logger']   = logger
        info['url']      = url
        info['pass']     = ""
        info['key_pass'] = {}
        info['scripts']  = _SCRIPTS

        if not info['schema']:
            info['schema'] = 'local'

        # find out what type of shell we have to deal with
        if info['schema'] in _SCHEMAS_SSH:
            info['type']     = "ssh"
            info['ssh_exe']  = ru.which("ssh")
            info['scp_exe']  = ru.which("scp")
            info['sftp_exe'] = ru.which("sftp")

        elif info['schema'] in _SCHEMAS_GSI:
            info['type']     = "ssh"
            info['ssh_exe']  = ru.which("gsissh")
            info['scp_exe']  = ru.which("gsiscp")
            info['sftp_exe'] = ru.which("gsisftp")

        elif info['schema'] in _SCHEMAS_SH:
            info['type']    = "sh"
            info['sh_args'] = "-i"
            info['sh_env']  = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
            info['cp_env']  = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
            info['fs_root'] = "/"

            if "SHELL" in os.environ:
                info['sh_exe'] = ru.which(os.environ["SHELL"])
                info['cp_exe'] = ru.which("cp")
            else:
                info['sh_exe'] = ru.which("sh")
                info['cp_exe'] = ru.which("cp")

        else:
            raise se.BadParameter._log(self.logger,
                      "cannot handle schema '%s://'" % url.schema)

        # depending on type, create command line (args, env etc)
        #
        # We always set term=vt100 to avoid ansi-escape sequences in the
        # prompt and elsewhere.  Also, we have to make sure that the shell is
        # an interactive login shell, so that it interprets the user's
        # startup files, and reacts on commands.
        try:
            info['latency'] = sumisc.get_host_latency(url)
            # FIXME: note that get_host_latency is considered broken (see
            #        saga/utils/misc.py line 73), and will return a constant
            #        250ms.
        except Exception as e:
            info['latency'] = 1.0  # generic value assuming slow link
            info['logger'].warning("Could not contact host '%s': %s" % (url, e))

        if info['type'] == "sh":

            info['sh_env'] = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes

            if not sumisc.host_is_local(url.host):
                raise se.BadParameter._log(self.logger,
                          "expect local host for '%s://', not '%s'"
                          % (url.schema, url.host))

            if 'user' in info and info['user']:
                pass
            else:
                info['user'] = getpass.getuser()

        else:
            info['ssh_env']   = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
            info['scp_env']   = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
            info['sftp_env']  = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
            info['ssh_args']  = "-t "                       # force pty
            info['scp_args']  = ""
            info['sftp_args'] = ""

            if session:

                for context in session.contexts:

                    # ssh can also handle UserPass contexts, and ssh type
                    # contexts.  gsissh can handle the same, but also X509
                    # contexts.
                    if context.type.lower() == "ssh":
                        if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI:

                            if context.attribute_exists("user_id") and context.user_id:
                                info['user'] = context.user_id

                            if context.attribute_exists("user_key") and context.user_key:
                                info['ssh_args']  += "-o IdentityFile=%s " % context.user_key
                                info['scp_args']  += "-o IdentityFile=%s " % context.user_key
                                info['sftp_args'] += "-o IdentityFile=%s " % context.user_key

                                if context.attribute_exists("user_pass") and context.user_pass:
                                    info['key_pass'][context.user_key] = context.user_pass

                    if context.type.lower() == "userpass":
                        if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI:
                            if context.attribute_exists("user_id") and context.user_id:
                                info['user'] = context.user_id
                            if context.attribute_exists("user_pass") and context.user_pass:
                                info['pass'] = context.user_pass

                    if context.type.lower() == "x509":
                        if info['schema'] in _SCHEMAS_GSI:

                            if context.attribute_exists("user_proxy") and context.user_proxy:
                                info['ssh_env']  += "X509_USER_PROXY='%s' " % context.user_proxy
                                info['scp_env']  += "X509_USER_PROXY='%s' " % context.user_proxy
                                info['sftp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy

                            if context.attribute_exists("user_cert") and context.user_cert:
                                info['ssh_env']  += "X509_USER_CERT='%s' " % context.user_cert
                                info['scp_env']  += "X509_USER_CERT='%s' " % context.user_cert
                                info['sftp_env'] += "X509_USER_CERT='%s' " % context.user_cert

                            if context.attribute_exists("user_key") and context.user_key:
                                info['ssh_env']  += "X509_USER_KEY='%s' " % context.user_key
                                info['scp_env']  += "X509_USER_KEY='%s' " % context.user_key
                                info['sftp_env'] += "X509_USER_KEY='%s' " % context.user_key

                            if context.attribute_exists("cert_repository") and context.cert_repository:
                                info['ssh_env']  += "X509_CERT_DIR='%s' " % context.cert_repository
                                info['scp_env']  += "X509_CERT_DIR='%s' " % context.cert_repository
                                info['sftp_env'] += "X509_CERT_DIR='%s' " % context.cert_repository

            if url.port and url.port != -1:
                info['ssh_args']  += "-p %d " % int(url.port)
                info['scp_args']  += "-P %d " % int(url.port)
                info['sftp_args'] += "-P %d " % int(url.port)

            # all ssh based shells allow for user_id and user_pass from
            # contexts -- but the data given in the URL take precedence
            if url.username: info['user'] = url.username
            if url.password: info['pass'] = url.password

            ctrl_user = pwd.getpwuid(os.getuid()).pw_name
            ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user

            if 'user' in info and info['user']:
                info['host_str'] = "%s@%s" % (info['user'], info['host_str'])
                info['ctrl']     = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, info['user'])
            else:
                info['user'] = getpass.getuser()
                info['ctrl'] = "%s_%%h_%%p.ctrl" % (ctrl_base)

            info['m_flags'] = _SSH_FLAGS_MASTER % ({'ctrl': info['ctrl']})
            info['s_flags'] = _SSH_FLAGS_SLAVE  % ({'ctrl': info['ctrl']})

            info['fs_root']      = url
            info['fs_root'].path = "/"

        # keep all collected info in the master dict, and return it for
        # registration
        return info
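# Hedged sketch (not part of the original source): illustrates how the
# 'info' dict assembled above could be turned into concrete master and
# slave ssh command lines.  The helper name 'compose_ssh_command' is a
# hypothetical illustration; the actual factory composes its command
# lines elsewhere.
def compose_ssh_command(info, master=True):
    # master connections carry the ControlMaster flags; slaves reuse the
    # control socket via the slave flags
    flags = info['m_flags'] if master else info['s_flags']
    # e.g. "/usr/bin/env TERM=vt100  ssh -t -p 2222 <flags> user@host"
    return "%s %s %s %s %s" % (info['ssh_env'], info['ssh_exe'],
                               info['ssh_args'], flags, info['host_str'])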
def lrms_config_hook(cls, name, cfg, lrms, logger, profile):

    import radical.utils as ru

    if not os.environ.get('SPARK_HOME'):
        logger.info("Downloading Apache Spark..")
        try:
            VERSION = "2.0.2"
            subprocess.check_call(
                "wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.2-bin-hadoop2.7.tgz".split())
            subprocess.check_call('tar -xzf spark-2.0.2-bin-hadoop2.7.tgz'.split())
            subprocess.check_call("rm spark-2.0.2-bin-hadoop2.7.tgz".split())
            subprocess.check_call(("mv spark-2.0.2-bin-hadoop2.7 spark-" + VERSION).split())
        except Exception as e:
            raise RuntimeError("Spark was not installed properly. Please try again. %s" % e)
        spark_home = os.getcwd() + '/spark-' + VERSION
    else:
        spark_home = os.environ['SPARK_HOME']

    # -------------------------------------------------------------------
    # locate JAVA_HOME if it is not set in the environment
    platform_os = sys.platform
    java_home   = os.environ.get('JAVA_HOME')

    if platform_os == "linux" or platform_os == "linux2":
        if not java_home:
            java = ru.which('java')
            if java != '/usr/bin/java':
                jpos = java.split('bin')
            else:
                jpos = os.path.realpath('/usr/bin/java').split('bin')

            if jpos[0].find('jre') != -1:
                java_home = jpos[0][:jpos[0].find('jre')]
            else:
                java_home = jpos[0]
    else:
        if not java_home:
            try:
                java_home = subprocess.check_output("/usr/libexec/java_home").split()[0]
            except Exception:
                java_home = '/Library/Java/Home'

    # write the worker node list into Spark's 'slaves' file
    spark_conf_slaves = open(spark_home + "/conf/slaves", 'w')
    if len(lrms.node_list) == 1:
        spark_conf_slaves.write(lrms.node_list[0])   # + hostname
        spark_conf_slaves.write('\n')
    else:
        for nodename in lrms.node_list[1:]:
            spark_conf_slaves.write(nodename)        # + hostname
            spark_conf_slaves.write('\n')
    spark_conf_slaves.close()

    # determine the master IP for spark-env.sh
    if len(lrms.node_list) == 1:
        master_ip = lrms.node_list[0]
    else:
        try:
            master_ip = subprocess.check_output('hostname -f'.split()).strip()
        except Exception as e:
            raise RuntimeError("Master IP could not be detected. %s" % e)

    # set up default properties (spark-defaults.conf)
    spark_default_file  = open(spark_home + "/conf/spark-defaults.conf", 'w')
    spark_master_string = 'spark://%s:7077' % master_ip
    spark_default_file.write('spark.master ' + spark_master_string + '\n')
    spark_default_file.close()

    logger.info('Resource config: {0}'.format(cfg['resource_cfg']))

    # load the machine's configuration into the Spark environment of
    # master and slaves
    spark_env_file = open(spark_home + "/conf/spark-env.sh", 'w')
    if master_ip != 'localhost':
        for config in cfg['resource_cfg']['pre_bootstrap_1']:
            spark_env_file.write(config + '\n')
    spark_env_file.write('export SPARK_MASTER_HOST=' + master_ip + "\n")
    spark_env_file.write('export JAVA_HOME=' + java_home + "\n")
    spark_env_file.write('export SPARK_LOG_DIR=' + os.getcwd() + '/spark-logs' + '\n')
    # spark_env_file.write('export PYSPARK_PYTHON=`which python` \n')
    spark_env_file.close()

    # start the Spark cluster
    try:
        subprocess.check_output(spark_home + '/sbin/start-all.sh')
    except Exception as e:
        raise RuntimeError("Spark cluster failed to start: %s" % e)

    logger.info('Started Spark cluster')
    launch_command = spark_home + '/bin'

    # The LRMS instance is only available here -- everything which is later
    # needed by the scheduler or launch method is stored in an 'lm_info'
    # dict.  That lm_info dict will be attached to the scheduler's lrms_info
    # dict, and will be passed around as part of the opaque_slots structure,
    # so it is available on all LM create_command calls.
    lm_info = {'spark_home'    : spark_home,
               'master_ip'     : master_ip,
               'lm_detail'     : spark_master_string,
               'name'          : lrms.name,
               'launch_command': launch_command,
               'nodename'      : lrms.node_list[0]}

    return lm_info
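# Hedged sketch (not part of the original code): the daemons started by
# 'start-all.sh' above should eventually be torn down again.  This is a
# minimal, stand-alone sketch of that shutdown step; the function name
# 'stop_spark_cluster' is a hypothetical helper, not an existing API.
import subprocess

def stop_spark_cluster(lm_info, logger):
    # stop the master and worker daemons started by start-all.sh
    try:
        subprocess.check_output(lm_info['spark_home'] + '/sbin/stop-all.sh')
        logger.info('Stopped Spark cluster')
    except Exception as e:
        logger.warning('Spark cluster shutdown failed: %s' % e)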
def _configure(self):

    # aprun: job launcher for Cray systems
    self.launch_command = ru.which('aprun')

    if not self.launch_command:
        raise RuntimeError("aprun not found!")
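# Hedged sketch (not part of the original code): shows how the located
# 'aprun' executable could prefix a task command line.  The helper name
# and its parameters are hypothetical assumptions for illustration.
def build_aprun_command(launch_command, n_processes, executable, arguments):
    # 'aprun -n <N> <exe> <args...>' launches N instances on Cray compute nodes
    return '%s -n %d %s %s' % (launch_command, n_processes, executable,
                               ' '.join(arguments))

# example usage (hypothetical values):
#   cmd = build_aprun_command(self.launch_command, 32, '/bin/hostname', [])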
def _configure(self):

    # runjob: job launcher for IBM BG/Q systems, e.g. Joule
    self.launch_command = ru.which('runjob')

    raise NotImplementedError('RUNJOB LM still coupled to scheduler/LRMS')