Example #1
    def _configure(self):

        # Find rsh command
        self.launch_command = ru.which('rsh')

        if not self.launch_command:
            raise RuntimeError("rsh not found!")
Example #2
    def _configure(self):

        # runjob: job launcher for IBM BG/Q systems, e.g. Joule
        self.launch_command = ru.which('runjob')

        raise NotImplementedError('RUNJOB LaunchMethod still coupled to '
                                  'scheduler/ResourceManager')
Example #3
    def _configure(self):

        # poe: LSF specific wrapper for MPI (e.g. yellowstone)
        self.launch_command = ru.which('poe')

        if not self.launch_command:
            raise RuntimeError("rsh not found!")
Example #4
    def _configure(self):

        # Find rsh command
        self.launch_command = ru.which('rsh')

        if not self.launch_command:
            raise RuntimeError("rsh not found!")
Example #5
    def _configure(self):

        # poe: LSF specific wrapper for MPI (e.g. yellowstone)
        self.launch_command = ru.which('poe')

        if not self.launch_command:
            raise RuntimeError("poe not found!")
Example #6
    def _configure(self):

        self.launch_command = ru.which([
            'mpirun',            # General case
            'mpirun_rsh',        # Gordon @ SDSC
            'mpirun-mpich-mp',   # Mac OSX MacPorts
            'mpirun-openmpi-mp'  # Mac OSX MacPorts
        ])

        self.ccmrun_command = ru.which([
            'ccmrun',            # General case
        ])

        if not self.ccmrun_command:
            raise RuntimeError("ccmrun not found!")

        self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
Example #7
    def _configure(self):

        self._mpt = False
        self._rsh = False

        if '_rsh' in self.name.lower():
            self._rsh = True
            self.launch_command = ru.which([
                'mpirun_rsh',  # Gordon (SDSC)
                'mpirun'  # general case
            ])

        elif '_mpt' in self.name.lower():
            self._mpt = True
            self.launch_command = ru.which([
                'mpirun_mpt',  # Cheyenne (NCAR)
                'mpirun'  # general case
            ])
        else:
            self.launch_command = ru.which([
                'mpirun-mpich-mp',  # Mac OSX
                'mpirun-openmpi-mp',  # Mac OSX
                'mpirun',  # general case
            ])

        # don't use the full pathname as the user might load a different
        # compiler / MPI library suite from his CU pre_exec that requires
        # the launcher from that version -- see #572.
        # FIXME: then why are we doing this LM setup in the first place??
        if self.launch_command:
            self.launch_command = os.path.basename(self.launch_command)

        # do we need ccmrun or dplace?
        if '_ccmrun' in self.name:
            self._ccmrun = ru.which('ccmrun')
            if not self._ccmrun:
                raise RuntimeError("ccmrun not found!")

        if '_dplace' in self.name:
            self._dplace = ru.which('dplace')
            if not self._dplace:
                raise RuntimeError("dplace not found!")

        self.mpi_version, self.mpi_flavor = \
                                       self._get_mpi_info(self.launch_command)
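The basename trick above deserves a note: keeping only `mpirun` instead of the resolved path defers the `$PATH` lookup to task launch time, so a task `pre_exec` that loads a different MPI suite gets that suite's launcher. A toy illustration (the path is made up):

    import os

    configured = '/opt/mpich/bin/mpirun'       # resolved at configure time
    launcher   = os.path.basename(configured)  # re-resolved at launch time,
                                               # i.e. after any pre_exec
    assert launcher == 'mpirun'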
Example #8
    def _configure(self):

        self.launch_command = ru.which([
            'mpirun',  # General case
            'mpirun_rsh',  # Gordon @ SDSC
            'mpirun-mpich-mp',  # Mac OSX MacPorts
            'mpirun-openmpi-mp'  # Mac OSX MacPorts
        ])

        self.ccmrun_command = ru.which([
            'ccmrun',  # General case
        ])

        if not self.ccmrun_command:
            raise RuntimeError("ccmrun not found!")

        self.mpi_version, self.mpi_flavor = self._get_mpi_info(
            self.launch_command)
Example #9
def test_which () :
    """ 
    Test if 'which' can find things
    """

    shell_date = os.path.normpath (os.popen ("which date").read().strip())
    utils_date = ru.which ('date')

    assert (shell_date == utils_date), "'%s' != '%s'" % (shell_date, utils_date)
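On Python 3 the same cross-check can avoid the `os.popen` round-trip by comparing against the stdlib lookup instead; a variant sketch, assuming `date` is on `$PATH`:

    import os
    import shutil

    import radical.utils as ru

    def test_which_stdlib():
        stdlib_date = shutil.which('date')
        utils_date  = ru.which('date')
        assert stdlib_date is not None
        assert os.path.normpath(stdlib_date) == os.path.normpath(utils_date)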
Example #10
    def _configure(self):

        self.launch_command = ru.which([
            'mpirun',            # General case
            'mpirun_rsh',        # Gordon @ SDSC
            'mpirun-mpich-mp',   # Mac OSX MacPorts
            'mpirun-openmpi-mp'  # Mac OSX MacPorts
        ])

        self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
Example #11
    def _configure(self):

        self.launch_command = ru.which([
            'mpirun',            # General case
            'mpirun_rsh',        # Gordon @ SDSC
            'mpirun-mpich-mp',   # Mac OSX MacPorts
            'mpirun-openmpi-mp'  # Mac OSX MacPorts
        ])

        self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
Example #12
    def _configure(self):

        self.launch_command = ru.which('srun')

        out, err, ret = ru.sh_callout('%s -V' % self.launch_command)
        if ret:
            raise RuntimeError('cannot use srun [%s] [%s]' % (out, err))

        self._version = out.split()[-1]
        self._log.debug('using srun from %s [%s]', self.launch_command,
                        self._version)
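The parse above relies on `srun -V` printing a single line whose last whitespace-separated token is the version; `ru.sh_callout` returns the `(out, err, ret)` triple used here. A worked example of the split (the output string is typical, not guaranteed):

    out = 'slurm 17.11.7'        # typical `srun -V` output
    version = out.split()[-1]
    assert version == '17.11.7'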
Example #13
    def _configure(self):

        self.launch_command = ru.which([
            'mpiexec',            # General case
            'mpiexec.mpich',      # Linux, MPICH
            'mpiexec.hydra',      # Linux, MPICH
            'mpiexec.openmpi',    # Linux, OpenMPI
            'mpiexec-mpich-mp',   # Mac OSX MacPorts
            'mpiexec-openmpi-mp'  # Mac OSX MacPorts
        ])

        self.mpi_version, self.mpi_flavor = self._get_mpi_info(self.launch_command)
Example #14
    def discover(cls, logger=None):

        eenv = cls(logger)
        # detect nodes, cores and memory available
        eenv._detect_nodes()
        eenv._detect_cores_and_memory()

        # check for 'mpirun'
        eenv._mpirun_location = which('mpirun')
        eenv._aprun_location  = which('aprun')
        eenv._ssh_location    = which('ssh')

        # suggest a launch method. the current precedence is
        # aprun, mpirun, ssh, fork. this can be overridden
        # by passing the '--launch-method' parameter to the agent.

        if eenv._aprun_location is not None:
            eenv._launch_method = LAUNCH_METHOD_APRUN
        elif eenv._mpirun_location is not None:
            eenv._launch_method = LAUNCH_METHOD_MPIRUN
        elif eenv._ssh_location is not None:
            eenv._launch_method = LAUNCH_METHOD_SSH
        else:
            eenv._launch_method = LAUNCH_METHOD_LOCAL

        # create node dictionary
        for rn in eenv._raw_nodes:
            if rn not in eenv._nodes:
                eenv._nodes[rn] = {#'_count': 1,
                                   'cores': eenv._cores_per_node,
                                   'memory': eenv._memory_per_node}
            #else:
            #    eenv._nodes[rn]['_count'] += 1

        if logger is not None:
            logger.info(
                message="Discovered execution environment: %s" % eenv._nodes,
                suffix=LOG_SUFFIX)

        return eenv
Example #15
    def _find_executable(cls, names):
        """
        Takes a (list of) name(s) and looks for an executable in the path.  It
        will return the first match found, or `None` if none of the given names
        is found.
        """

        if not isinstance(names, list):
            names = [names]

        for name in names:
            ret = ru.which(name)
            if ret:
                return ret

        return None
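A hypothetical call site for this helper, mirroring the launch-method examples above; earlier names win, later names are fallbacks (the names are illustrative):

    # inside another classmethod of the same class:
    launcher = cls._find_executable(['mpirun_mpt', 'mpirun'])
    if not launcher:
        raise RuntimeError('no mpirun variant found')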
Example #16
    def _configure(self):

        # mpirun_rsh (e.g. on Gordon@SDSC, Stampede@TACC)
        if not ru.which('mpirun_rsh'):
            raise Exception("mpirun_rsh could not be found")

        # We don't use the full pathname as the user might load a different
        # compiler / MPI library suite from his CU pre_exec that requires
        # the launcher from that version, as experienced on stampede in #572.
        self.launch_command = 'mpirun_rsh'

        # alas, the way to transplant env variables to the target node differs
        # per mpi(run) version...
        version_info = sp.check_output(['%s -v' % self.launch_command], shell=True)
        if 'version:' in version_info:
            self.launch_version = version_info.split(':')[1].strip().lower()
        else:
            self.launch_version = 'unknown'
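One caveat with this parse: on Python 3, `sp.check_output` returns `bytes`, so the `'version:' in version_info` test raises a `TypeError`. A hedged Python-3 variant of the same probe:

    import subprocess as sp

    version_info = sp.check_output('mpirun_rsh -v', shell=True,
                                   stderr=sp.STDOUT).decode(errors='replace')
    if 'version:' in version_info:
        launch_version = version_info.split(':')[1].strip().lower()
    else:
        launch_version = 'unknown'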
Example #17
    def _configure(self):
        # Find ssh command
        command = ru.which('ssh')

        if command is not None:

            # Some MPI environments (e.g. SGE) put a link to rsh as "ssh" into
            # the path.  We try to detect that and then use different arguments.
            if os.path.islink(command):

                target = os.path.realpath(command)

                if os.path.basename(target) == 'rsh':
                    self._log.info('Detected that "ssh" is a link to "rsh".')
                    return target

            command = '%s -o StrictHostKeyChecking=no -o ControlMaster=auto' % command

        self.launch_command = command
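The link detection above reads well as a standalone function; a sketch of the same logic, with the option string copied from the example and the rsh case assumed to mean "use the plain target, no ssh-only options":

    import os

    def resolve_ssh(command):
        if os.path.islink(command):
            target = os.path.realpath(command)
            if os.path.basename(target) == 'rsh':
                return target  # rsh in disguise: skip ssh-only options
        return ('%s -o StrictHostKeyChecking=no '
                '-o ControlMaster=auto' % command)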
Example #18
    def _configure(self):

        # mpirun_rsh (e.g. on Gordon@SDSC, Stampede@TACC)
        if not ru.which('mpirun_rsh'):
            raise Exception("mpirun_rsh could not be found")

        # We don't use the full pathname as the user might load a different
        # compiler / MPI library suite from his CU pre_exec that requires
        # the launcher from that version, as experienced on stampede in #572.
        self.launch_command = 'mpirun_rsh'

        # alas, the way to transplant env variables to the target node differs
        # per mpi(run) version...
        version_info = sp.check_output(['%s -v' % self.launch_command],
                                       shell=True)
        if 'version:' in version_info:
            self.launch_version = version_info.split(':')[1].strip().lower()
        else:
            self.launch_version = 'unknown'
Example #19
    def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
        """
        This hook is symmetric to the config hook above, and is called during
        shutdown sequence, for the sake of freeing allocated resources.
        """

        if 'dvm_uri' in lm_info:
            try:
                logger.info('terminating dvm')
                orterun = ru.which('orterun')
                if not orterun:
                    raise Exception("Couldn't find orterun")
                ru.sh_callout('%s --hnp %s --terminate' %
                              (orterun, lm_info['dvm_uri']))
                profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])
            except Exception as e:
                # use the same event name as for runtime failures - those are
                # not distinguishable at the moment from termination failures
                profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
                logger.exception('dvm termination failed')
Example #20
    def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
        """
        This hook is symmetric to the config hook above, and is called during
        shutdown sequence, for the sake of freeing allocated resources.
        """

        if 'dvm_uri' in lm_info:
            try:
                logger.info('terminating dvm')
                orterun = ru.which('orterun')
                if not orterun:
                    raise Exception("Couldn't find orterun")
                ru.sh_callout('%s --hnp %s --terminate' 
                             % (orterun, lm_info['dvm_uri']))
                profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])

            except Exception as e:
                # use the same event name as for runtime failures - those are
                # not distinguishable at the moment from termination failures
                profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'], msg=e)
                logger.exception('dvm termination failed')
Example #21
    def initialize(self):

        self._pwd = os.getcwd()
        self.gtod = "%s/gtod" % self._pwd

        self.register_input(rps.AGENT_EXECUTING_PENDING,
                            rpc.AGENT_EXECUTING_QUEUE, self.work)

        self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
                             rpc.AGENT_STAGING_OUTPUT_QUEUE)

        self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
        self.register_subscriber(rpc.CONTROL_PUBSUB, self.command_cb)

        addr_wrk = self._cfg['bridges']['funcs_req_queue']
        addr_res = self._cfg['bridges']['funcs_res_queue']

        self._log.debug('wrk in  addr: %s', addr_wrk['addr_in' ])
        self._log.debug('res out addr: %s', addr_res['addr_out'])

        self._funcs_req = rpu.Queue(self._session, 'funcs_req_queue',
                                    rpu.QUEUE_INPUT, self._cfg,
                                    addr_wrk['addr_in'])
        self._funcs_res = rpu.Queue(self._session, 'funcs_res_queue',
                                    rpu.QUEUE_OUTPUT, self._cfg,
                                    addr_res['addr_out'])

        self._cancel_lock    = ru.RLock()
        self._cus_to_cancel  = list()
        self._cus_to_watch   = list()
        self._watch_queue    = queue.Queue ()

        self._pid = self._cfg['pid']

        # run watcher thread
        self._collector = mt.Thread(target=self._collect)
        self._collector.daemon = True
        self._collector.start()

        # we need to launch the executors on all nodes, and use the
        # agent_launcher for that
        self._launcher = rp.agent.LaunchMethod.create(
                name    = self._cfg.get('agent_launch_method'),
                cfg     = self._cfg,
                session = self._session)

        # now run the func launcher on all nodes
        ve  = os.environ.get('VIRTUAL_ENV',  '')
        exe = ru.which('radical-pilot-agent-funcs')

        if not exe:
            exe = '%s/rp_install/bin/radical-pilot-agent-funcs' % self._pwd

        for idx, node in enumerate(self._cfg['rm_info']['node_list']):
            uid   = 'func_exec.%04d' % idx
            pwd   = '%s/%s' % (self._pwd, uid)
            funcs = {'uid'        : uid,
                     'description': {'executable'   : exe,
                                     'arguments'    : [pwd, ve],
                                     'cpu_processes': 1,
                                     'environment'  : [],
                                    },
                     'slots'      : {'nodes'        : [{'name'  : node[0],
                                                        'uid'   : node[1],
                                                        'cores' : [[0]],
                                                        'gpus'  : []
                                                       }]
                                    },
                     'cfg'        : {'addr_wrk'     : addr_wrk['addr_out'],
                                     'addr_res'     : addr_res['addr_in']
                                    }
                    }
            self._spawn(self._launcher, funcs)
Example #22
    def _create_master_entry (self, url, session, logger) :
        # FIXME: cache 'which' results, etc
        # FIXME: check 'which' results

        with self.rlock :
      # if True :

            info = {}

            info['schema']    = url.schema.lower ()
            info['host_str']  = url.host
            info['logger']    = logger
            info['url']       = url
            info['pass']      = ""
            info['key_pass']  = {}

            if  not info['schema'] :
                info['schema'] = 'local'
                    

            # find out what type of shell we have to deal with
            if  info['schema']   in _SCHEMAS_SSH :
                info['type']     = "ssh"
                info['ssh_exe']  = ru.which ("ssh")
                info['scp_exe']  = ru.which ("scp")
                info['sftp_exe'] = ru.which ("sftp")

            elif info['schema']  in _SCHEMAS_GSI :
                info['type']     = "ssh"
                info['ssh_exe']  = ru.which ("gsissh")
                info['scp_exe']  = ru.which ("gsiscp")
                info['sftp_exe'] = ru.which ("gsisftp")

            elif info['schema']  in _SCHEMAS_SH :
                info['type']     = "sh"
                info['sh_args']  = "-i"
                info['sh_env']   = "/usr/bin/env TERM=vt100"
                info['cp_env']   = "/usr/bin/env TERM=vt100"
                info['fs_root']  = "/"

                if  "SHELL" in os.environ :
                    info['sh_exe'] =  ru.which (os.environ["SHELL"])
                    info['cp_exe'] =  ru.which ("cp")
                else :
                    info['sh_exe'] =  ru.which ("sh")
                    info['cp_exe'] =  ru.which ("cp")

            else :
                raise se.BadParameter._log (self.logger, \
                	  "cannot handle schema '%s://'" % url.schema)


            # depending on type, create command line (args, env etc)
            #
            # We always set term=vt100 to avoid ansi-escape sequences in the prompt
            # and elsewhere.  Also, we have to make sure that the shell is an
            # interactive login shell, so that it interprets the users startup
            # files, and reacts on commands.

            try :
                info['latency'] = sumisc.get_host_latency (url)

                # FIXME: note that get_host_latency is considered broken (see
                # saga/utils/misc.py line 73), and will return a constant 250ms.

            except Exception  as e :
                info['latency'] = 1.0  # generic value assuming slow link
                info['logger'].warning ("Could not contact host '%s': %s" % (url, e))
                
            if  info['type'] == "sh" :

                if not sumisc.host_is_local (url.host) :
                    raise se.BadParameter._log (self.logger, \
                            "expect local host for '%s://', not '%s'" % (url.schema, url.host))

                if  'user' in info and info['user'] :
                    pass
                else :
                    info['user'] = getpass.getuser ()

            else :
                info['ssh_env']   =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['scp_env']   =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['sftp_env']  =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['ssh_args']  =  "-t "                       # force pty
                info['scp_args']  =  ""
                info['sftp_args'] =  ""

                if  session :

                    for context in session.contexts :

                        # ssh can also handle UserPass contexts, and ssh type contexts.
                        # gsissh can handle the same, but also X509 contexts.

                        if  context.type.lower () == "ssh" :
                            if  info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI :

                                if  context.attribute_exists ("user_id") and context.user_id :
                                    info['user']  = context.user_id

                                if  context.attribute_exists ("user_key")  and  context.user_key  :
                                    info['ssh_args']  += "-o IdentityFile=%s " % context.user_key 
                                    info['scp_args']  += "-o IdentityFile=%s " % context.user_key 
                                    info['sftp_args'] += "-o IdentityFile=%s " % context.user_key 

                                    if  context.attribute_exists ("user_pass") and context.user_pass :
                                        info['key_pass'][context.user_key] = context.user_pass

                        if  context.type.lower () == "userpass" :
                            if  info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI :
                                if  context.attribute_exists ("user_id") and context.user_id :
                                    info['user']       = context.user_id
                                if  context.attribute_exists ("user_pass") and context.user_pass :
                                    info['pass']       = context.user_pass

                        if  context.type.lower () == "x509" :
                            if  info['schema'] in _SCHEMAS_GSI :

                                if  context.attribute_exists ("user_proxy")  and   context.user_proxy :
                                    info['ssh_env']   += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['scp_env']   += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['sftp_env']  += "X509_USER_PROXY='%s' " % context.user_proxy
                       
                                if  context.attribute_exists ("user_cert")   and  context.user_cert :
                                    info['ssh_env']   += "X509_USER_CERT='%s' " % context.user_cert
                                    info['scp_env']   += "X509_USER_CERT='%s' " % context.user_cert
                                    info['sftp_env']  += "X509_USER_CERT='%s' " % context.user_cert
                       
                                if  context.attribute_exists ("user_key")    and  context.user_key :
                                    info['ssh_env']   += "X509_USER_key='%s' "  % context.user_key
                                    info['scp_env']   += "X509_USER_key='%s' "  % context.user_key
                                    info['sftp_env']  += "X509_USER_key='%s' "  % context.user_key
                       
                                if  context.attribute_exists ("cert_repository") and context.cert_repository :
                                    info['ssh_env']   += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['scp_env']   += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['sftp_env']  += "X509_CERT_DIR='%s' "  % context.cert_repository

                if url.port and url.port != -1 :
                    info['ssh_args']  += "-p %d " % int(url.port)
                    info['scp_args']  += "-p %d " % int(url.port)
                    info['sftp_args'] += "-P %d " % int(url.port)


                # all ssh based shells allow for user_id and user_pass from contexts
                # -- but the data given in the URL take precedence

                if url.username   :  info['user'] = url.username
                if url.password   :  info['pass'] = url.password

                ctrl_user = pwd.getpwuid (os.getuid ()).pw_name
                ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user


                if  'user' in info and info['user'] :
                    info['host_str'] = "%s@%s"  % (info['user'], info['host_str'])
                    info['ctrl'] = "%s_%%h_%%p.%s.%s.ctrl" % (ctrl_base, os.getpid (), info['user'])
                else :
                    info['user'] = getpass.getuser ()
                    info['ctrl'] = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, os.getpid ())

                info['m_flags']  = _SSH_FLAGS_MASTER % ({'ctrl' : info['ctrl']})
                info['s_flags']  = _SSH_FLAGS_SLAVE  % ({'ctrl' : info['ctrl']})
                info['fs_root']  = url

                info['fs_root'].path = "/"


            # keep all collected info in the master dict, and return it for
            # registration
            return info
Example #23
    def _configure(self):

        self.launch_command = ru.which('orterun')
Example #24
    def _which(self, cmd):

        ret = ru.which(cmd)
        if not ret:
            raise RuntimeError('cmd %s not found' % cmd)
        return ret
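A typical (hypothetical) use of this wrapper inside a `_configure`, failing fast when the launcher is absent; `aprun` is an illustrative choice:

    # inside a _configure() of the same class:
    self.launch_command = self._which('aprun')  # raises RuntimeError if missing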
Example #25
    def lrms_config_hook(cls, name, cfg, lrms, logger, profile):
        
        import radical.utils as ru

        if not os.environ.get('SPARK_HOME'):
            logger.info("Downloading Apache Spark..")
            try:

                VERSION = "2.0.2"
                subprocess.check_call("wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.2-bin-hadoop2.7.tgz".split())
                subprocess.check_call('tar -xzf spark-2.0.2-bin-hadoop2.7.tgz'.split())
                subprocess.check_call("rm spark-2.0.2-bin-hadoop2.7.tgz ".split())
                subprocess.check_call(("mv spark-2.0.2-bin-hadoop2.7 spark-" + VERSION).split())
            except  Exception as e:
                raise RuntimeError("Spark wasn't installed properly. Please try again. %s " % e )
            spark_home = os.getcwd() + '/spark-' + VERSION
        else:
            spark_home = os.environ['SPARK_HOME']
        
        #-------------------------------------------------------------------
        platform_os = sys.platform
        java_home = os.environ.get('JAVA_HOME')

        if platform_os == "linux" or platform_os == "linux2":
            if not java_home:
                java = ru.which('java')
                if java != '/usr/bin/java':
                    jpos=java.split('bin')
                else:
                    jpos = os.path.realpath('/usr/bin/java').split('bin')

                if jpos[0].find('jre') != -1:
                    java_home = jpos[0][:jpos[0].find('jre')]
                else:
                    java_home = jpos[0]
        else:
            if not java_home:
                try:
                    java_home = subprocess.check_output("/usr/libexec/java_home").split()[0]
                except Exception:
                    java_home = '/Library/Java/Home'


        spark_conf_slaves = open(spark_home+"/conf/slaves",'w')

        if len(lrms.node_list) == 1:
            spark_conf_slaves.write(lrms.node_list[0])#+hostname)
            spark_conf_slaves.write('\n')
        else:
            for nodename in lrms.node_list[1:]:
                spark_conf_slaves.write(nodename)   # +hostname)
                spark_conf_slaves.write('\n')

        spark_conf_slaves.close()

        # put the master IP into the spark-env.sh file

        if len(lrms.node_list) ==1:
            master_ip = lrms.node_list[0]
        else:
            try:
                master_ip = subprocess.check_output('hostname -f'.split()).strip()
            except Exception as e:
                raise RuntimeError("Master ip couldn't be detected. %s" % e)

        #Setup default env properties:
        spark_default_file = open(spark_home + "/conf/spark-defaults.conf",'w')
        spark_master_string = 'spark://%s:7077' % master_ip
        spark_default_file.write('spark.master  ' + spark_master_string + '\n')
        spark_default_file.close()
        logger.info("Let's print the config")
        logger.info('Config : {0}'.format(cfg['resource_cfg']))

        spark_env_file = open(spark_home + "/conf/spark-env.sh",'w')
        # load the machine configuration into the Spark environment
        # of the master and the slaves
        if master_ip!='localhost':
            for config in cfg['resource_cfg']['pre_bootstrap_0']:
                spark_env_file.write(config + '\n')

        spark_env_file.write('export SPARK_MASTER_HOST=' + master_ip + "\n")
        spark_env_file.write('export JAVA_HOME=' + java_home + "\n")
        spark_env_file.write('export SPARK_LOG_DIR='+os.getcwd()+'/spark-logs'+'\n')
        #spark_env_file.write('export PYSPARK_PYTHON=`which python` \n')
        spark_env_file.close()


        #### Start spark Cluster
        try:
            subprocess.check_output(spark_home + '/sbin/start-all.sh')
        except Exception as e:
            raise RuntimeError("Spark Cluster failed to start: %s" % e)

        logger.info('Start Spark Cluster')
        launch_command = spark_home +'/bin'

        # The LRMS instance is only available here -- everything which is later
        # needed by the scheduler or launch method is stored in an 'lm_info'
        # dict.  That lm_info dict will be attached to the scheduler's lrms_info
        # dict, and will be passed around as part of the slots structure,
        # so it is available on all LM create_command calls.
        lm_info = {'spark_home'    : spark_home,
                   'master_ip'     : master_ip,
                   'lm_detail'     : spark_master_string,
                   'name'          : lrms.name,
                   'launch_command': launch_command,
                   'nodename'      : lrms.node_list[0]}

        return lm_info
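The `JAVA_HOME` derivation above (split the `java` path on 'bin', then trim a trailing 'jre' component) can be checked with a worked example; the path is illustrative:

    java = '/usr/lib/jvm/java-8-openjdk/jre/bin/java'
    jpos = java.split('bin')  # ['/usr/lib/jvm/java-8-openjdk/jre/', '/java']
    root = jpos[0]
    if root.find('jre') != -1:
        java_home = root[:root.find('jre')]
    else:
        java_home = root
    assert java_home == '/usr/lib/jvm/java-8-openjdk/'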
Example #26
    def _which(self, cmd):

        ret = ru.which(cmd)
        if not ret:
            raise RuntimeError('cmd %s not found' % cmd)
        return ret
Example #27
 def _configure(self):
     # ibrun: wrapper for mpirun at TACC
     self.launch_command = ru.which('ibrun')
Example #28
    def lrms_config_hook(cls, name, cfg, lrms, logger):
        """
        FIXME: this config hook will inspect the LRMS nodelist and, if needed,
               will start the YARN cluster on node[0].
        """
        import radical.utils as ru

        logger.info('Hook called by YARN LRMS with the name %s'%lrms.name)

        def config_core_site(node):

            core_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/core-site.xml','r')
            lines = core_site_file.readlines()
            core_site_file.close()

            prop_str  = '<property>\n'
            prop_str += '  <name>fs.default.name</name>\n'
            prop_str += '    <value>hdfs://%s:54170</value>\n'%node
            prop_str += '</property>\n'

            lines.insert(-1,prop_str)

            core_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/core-site.xml','w')
            for line in lines:
                core_site_file.write(line)
            core_site_file.close()

        def config_hdfs_site(nodes):

            hdfs_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/hdfs-site.xml','r')
            lines = hdfs_site_file.readlines()
            hdfs_site_file.close()

            prop_str  = '<property>\n'
            prop_str += ' <name>dfs.replication</name>\n'
            prop_str += ' <value>1</value>\n'
            prop_str += '</property>\n'

            prop_str += '<property>\n'
            prop_str += '  <name>dfs.name.dir</name>\n'
            prop_str += '    <value>file:///tmp/hadoop/hadoopdata/hdfs/namenode</value>\n'
            prop_str += '</property>\n'

            prop_str += '<property>\n'
            prop_str += '  <name>dfs.data.dir</name>\n'
            prop_str += '    <value>file:///tmp/hadoop/hadoopdata/hdfs/datanode</value>\n'
            prop_str += '</property>\n'

            lines.insert(-1,prop_str)

            hdfs_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/hdfs-site.xml','w')
            for line in lines:
                hdfs_site_file.write(line)
            hdfs_site_file.close()

        def config_mapred_site():

            mapred_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/mapred-site.xml.template','r')
            lines = mapred_site_file.readlines()
            mapred_site_file.close()

            prop_str  = ' <property>\n'
            prop_str += '  <name>mapreduce.framework.name</name>\n'
            prop_str += '   <value>yarn</value>\n'
            prop_str += ' </property>\n'

            lines.insert(-1,prop_str)

            mapred_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/mapred-site.xml','w')
            for line in lines:
                mapred_site_file.write(line)
            mapred_site_file.close()

        def config_yarn_site(cores,nodelist,hostname):

            yarn_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/yarn-site.xml','r')
            lines = yarn_site_file.readlines()
            yarn_site_file.close()

            total_mem_str=subprocess.check_output(['grep','MemTotal','/proc/meminfo'])
            total_free_mem=int(total_mem_str.split()[1])/1048

            if nodelist.__len__() == 1:
                cores_used = cores/2
                total_mem = total_free_mem*0.75
            else:
                cores_used = cores*(len(nodelist)-1)
                total_mem = total_free_mem*(len(nodelist)-1)
                slaves = open(os.getcwd()+'/hadoop/etc/hadoop/slaves','w')
                for node in nodelist[1:]:
                    slaves.write('%s\n'%(node+hostname))
                slaves.close()
                master = open(os.getcwd()+'/hadoop/etc/hadoop/masters','w')
                master.write('%s\n'%(nodelist[0]+hostname))
                master.close()

            max_app_mem = total_mem/cores_used

            prop_str  = ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.aux-services</name>\n'
            prop_str += '    <value>mapreduce_shuffle</value>\n'
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.scheduler.maximum-allocation-mb</name>\n'
            prop_str += '   <value>%d</value>\n'%max_app_mem
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.resourcemanager.hostname</name>\n'
            prop_str += '   <value>%s</value>\n'%(nodelist[0]+hostname)
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.resource.cpu-vcores</name>\n'
            prop_str += '   <value>%d</value>\n'%cores_used
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.resource.memory-mb</name>\n'
            prop_str += '   <value>%d</value>\n'%total_mem
            prop_str += ' </property>\n'

            lines.insert(-1,prop_str)

            yarn_site_file = open(os.getcwd()+'/hadoop/etc/hadoop/yarn-site.xml','w')
            for line in lines:
                yarn_site_file.write(line)
            yarn_site_file.close()

            scheduler_file=open(os.getcwd()+'/hadoop/etc/hadoop/capacity-scheduler.xml','r')
            lines=scheduler_file.readlines()
            scheduler_file.close()

            for line in lines:
                if line.startswith('    <value>org.apache.hadoop.yarn.util.resource.'):
                    new_line='    <value>org.apache.hadoop.yarn.util.resource.'+'DefaultResourceCalculator</value>\n'
                    lines[lines.index(line)]=new_line
                elif line.startswith('    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>'):
                    new_line='    <value>1</value>\n'
                    lines[lines.index(line)+1]=new_line
                        
            scheduler_file=open(os.getcwd()+'/hadoop/etc/hadoop/capacity-scheduler.xml','w')
            for line in lines:
                scheduler_file.write(line)
            
            scheduler_file.close()

        # If the LRMS used is not YARN the namenode url is going to be
        # the first node in the list and the port is the default one, else
        # it is the one that the YARN LRMS returns
        hadoop_home = None
        if lrms.name == 'YARNLRMS': # FIXME: use constant
            logger.info('Hook called by YARN LRMS')
            logger.info('NameNode: {0}'.format(lrms.namenode_url))
            service_url    = lrms.namenode_url
            rm_url         = "%s:%s" % (lrms.rm_ip, lrms.rm_port)
            rm_ip          = lrms.rm_ip
            launch_command = ru.which('yarn')

        else:
            # Here are the necessary commands to start the cluster.
            if lrms.node_list[0] == 'localhost':
                #Download the tar file
                node_name = lrms.node_list[0]
                stat = os.system("wget http://apache.claz.org/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz")
                stat = os.system('tar xzf hadoop-2.6.0.tar.gz;mv hadoop-2.6.0 hadoop;rm -rf hadoop-2.6.0.tar.gz')
            else:
                node = subprocess.check_output('/bin/hostname')
                logger.info('Entered Else creation')
                node_name = node.split('\n')[0]
                stat = os.system("wget http://apache.claz.org/hadoop/common/hadoop-2.6.0/hadoop-2.6.0.tar.gz")
                stat = os.system('tar xzf hadoop-2.6.0.tar.gz;mv hadoop-2.6.0 hadoop;rm -rf hadoop-2.6.0.tar.gz')
                # TODO: Decide how the agent will get Hadoop tar ball.


            hadoop_home        = os.getcwd() + '/hadoop'
            hadoop_install     = hadoop_home
            hadoop_mapred_home = hadoop_home
            hadoop_common_home = hadoop_home
            hadoop_hdfs_home   = hadoop_home
            yarn_home          = hadoop_home

            hadoop_common_lib_native_dir = hadoop_home + '/lib/native'

            #-------------------------------------------------------------------
            # Solution to find Java's home folder:
            # http://stackoverflow.com/questions/1117398/java-home-directory

            java = ru.which('java')
            if java != '/usr/bin/java':
                jpos=java.split('bin')
            else:
                jpos = os.path.realpath('/usr/bin/java').split('bin')

            if jpos[0].find('jre') != -1:
                java_home = jpos[0][:jpos[0].find('jre')]
            else:
                java_home = jpos[0]

            hadoop_env_file = open(hadoop_home+'/etc/hadoop/hadoop-env.sh','r')
            hadoop_env_file_lines = hadoop_env_file.readlines()
            hadoop_env_file.close()
            hadoop_env_file_lines[24] = 'export JAVA_HOME=%s\n' % java_home
            hadoop_env_file = open(hadoop_home+'/etc/hadoop/hadoop-env.sh','w')
            for line in hadoop_env_file_lines:
                hadoop_env_file.write(line)
            hadoop_env_file.close()
            host=node_name.split(lrms.node_list[0])[1]

            config_core_site(node_name)
            config_hdfs_site(lrms.node_list)
            config_mapred_site()
            config_yarn_site(lrms.cores_per_node,lrms.node_list,host)

            logger.info('Start Formatting DFS')
            namenode_format = os.system(hadoop_home + '/bin/hdfs namenode -format -force')
            logger.info('DFS Formatted. Starting DFS.')
            logger.info('Starting YARN')
            yarn_start = subprocess.check_output([hadoop_home + '/sbin/start-all.sh'])
            if 'Error' in yarn_start:
                raise RuntimeError('Unable to start YARN cluster: %s' \
                    % (yarn_start))
            else:
                logger.info('Started YARN')

            #-------------------------------------------------------------------
            # Creating user's HDFS home folder
            logger.debug('Running: %s/bin/hdfs dfs -mkdir /user'%hadoop_home)
            os.system('%s/bin/hdfs dfs -mkdir /user'%hadoop_home)
            uname = subprocess.check_output('whoami').split('\n')[0]
            logger.debug('Running: %s/bin/hdfs dfs -mkdir /user/%s'%(hadoop_home,uname))
            os.system('%s/bin/hdfs dfs -mkdir /user/%s'%(hadoop_home,uname))
            check = subprocess.check_output(['%s/bin/hdfs'%hadoop_home,'dfs', '-ls', '/user'])
            logger.info(check)
            # FIXME YARN: why was the scheduler configure called here?  Configure
            #             is already called during scheduler instantiation
            # self._scheduler._configure()

            service_url = node_name + ':54170'
            rm_url      = node_name
            launch_command = yarn_home + '/bin/yarn'
            rm_ip = node_name


        # The LRMS instance is only available here -- everything which is later
        # needed by the scheduler or launch method is stored in an 'lm_info'
        # dict.  That lm_info dict will be attached to the scheduler's lrms_info
        # dict, and will be passed around as part of the opaque_slots structure,
        # so it is available on all LM create_command calls.
        lm_info = {'service_url'   : service_url,
                   'rm_url'        : rm_url,
                   'hadoop_home'   : hadoop_home,
                   'rm_ip'         : rm_ip,
                   'name'          : lrms.name,
                   'launch_command': launch_command,
                   'nodename'      : lrms.node_list[0] }

        return lm_info
Example #29
    def lrms_config_hook(cls, name, cfg, lrms, logger, profiler):
        """
        FIXME: this config hook will manipulate the LRMS nodelist.  Not a nice
               thing to do, but hey... :P
               What really should be happening is that the LRMS digs information
               on node reservation out of the config and configures the node
               list accordingly.  This config hook should be limited to starting
               the DVM.
        """

        dvm_command = ru.which('orte-dvm')
        if not dvm_command:
            raise Exception("Couldn't find orte-dvm")

        # Now that we found the orte-dvm, get ORTE version
        out, err, ret = ru.sh_callout('orte-info | grep "Open RTE"',
                                      shell=True)
        orte_info = dict()
        for line in out.split('\n'):

            line = line.strip()
            if not line:
                continue

            key, val = line.split(':', 1)
            if 'Open RTE' == key.strip():
                orte_info['version'] = val.strip()
            elif 'Open RTE repo revision' == key.strip():
                orte_info['version_detail'] = val.strip()

        assert (orte_info.get('version'))
        logger.info("Found Open RTE: %s / %s", orte_info['version'],
                    orte_info.get('version_detail'))

        # Use (g)stdbuf to disable buffering.
        # We need this to get the "DVM ready",
        # without waiting for orte-dvm to complete.
        # The command seems to be generally available on our Crays;
        # if not, we can code some home-cooked pty stuff.
        stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
        if not stdbuf_cmd:
            raise Exception("Couldn't find (g)stdbuf")
        stdbuf_arg = "-oL"

        # Base command = (g)stdbuf <args> + orte-dvm + debug_args
        dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

        # Additional (debug) arguments to orte-dvm
        if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'):
            debug_strings = [
                '--debug-devel', '--mca odls_base_verbose 100',
                '--mca rml_base_verbose 100'
            ]
        else:
            debug_strings = []

        # Split up the debug strings into args and add them to the dvm_args
        for ds in debug_strings:
            dvm_args.extend(ds.split())

        vm_size = len(lrms.node_list)
        logger.info("Start DVM on %d nodes ['%s']", vm_size,
                    ' '.join(dvm_args))
        profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id'])

        dvm_uri = None
        dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT)

        while True:

            line = dvm_process.stdout.readline().strip()

            if line.startswith('VMURI:'):

                if len(line.split(' ')) != 2:
                    raise Exception("Unknown VMURI format: %s" % line)

                label, dvm_uri = line.split(' ', 1)

                if label != 'VMURI:':
                    raise Exception("Unknown VMURI format: %s" % line)

                logger.info("ORTE DVM URI: %s" % dvm_uri)

            elif line == 'DVM ready':

                if not dvm_uri:
                    raise Exception("VMURI not found!")

                logger.info("ORTE DVM startup successful!")
                profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id'])
                break

            else:

                # Check if the process is still around,
                # and log output in debug mode.
                if dvm_process.poll() is None:
                    logger.debug("ORTE: %s", line)
                else:
                    # Process is gone: fatal!
                    profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
                    raise Exception("ORTE DVM process disappeared")

        # ----------------------------------------------------------------------
        def _watch_dvm():

            logger.info('starting DVM watcher')

            retval = dvm_process.poll()
            while retval is None:
                line = dvm_process.stdout.readline().strip()
                if line:
                    logger.debug('dvm output: %s', line)
                else:
                    time.sleep(1.0)

            if retval != 0:
                # send a kill signal to the main thread.
                # We know that Python and threading are likely not to play well
                # with signals - but this is an exceptional case, and not part
                # of the standard termination sequence.  If the signal is
                # swallowed, the next `orte-submit` call will trigger
                # termination anyway.
                os.kill(os.getpid(), signal.SIGKILL)  # needs `import signal`

            logger.info('DVM stopped (%d)' % dvm_process.returncode)

        # ----------------------------------------------------------------------

        dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
        dvm_watcher.start()

        lm_info = {'dvm_uri': dvm_uri, 'version_info': {name: orte_info}}

        # we need to inform the actual LM instance about the DVM URI.  So we
        # pass it back to the LRMS which will keep it in an 'lm_info', which
        # will then be passed as part of the slots via the scheduler
        return lm_info
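The startup handshake parsed in the loop above consists of two marker lines on the DVM's stdout: `VMURI: <uri>`, and later `DVM ready` once the VM is usable. A worked example of the URI parse (the URI value is made up):

    line = 'VMURI: 787152896.0;tcp://10.0.0.1:34865'
    label, dvm_uri = line.split(' ', 1)
    assert label == 'VMURI:'
    assert dvm_uri == '787152896.0;tcp://10.0.0.1:34865'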
Example #30
 def _configure(self):
     # ibrun: wrapper for mpirun at TACC
     self.launch_command = ru.which('ibrun')
Example #31
    def _configure(self):

        self.launch_command = ru.which('orterun')
Example #32
 def _configure(self):
     # poe: LSF specific wrapper for MPI (e.g. yellowstone)
     self.launch_command = ru.which('poe')
Example #33
    def _configure(self):

        # dplace: job launcher for SGI systems (e.g. on Blacklight)
        self.launch_command = ru.which('dplace')
Example #34
 def _configure(self):
     # ccmrun: Cluster Compatibility Mode (CCM) job launcher for Cray systems
     self.launch_command = ru.which('ccmrun')
Example #35
    def _configure(self):

        # ensure that `prun` is in the path (`which` will raise otherwise)
        ru.which('prun')
        self.launch_command = 'prun'
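Note that every other example in this list treats a falsy return from `ru.which` as "not found" rather than relying on it to raise; a defensive variant of this check under that assumption:

    def _configure(self):

        if not ru.which('prun'):
            raise RuntimeError('prun not found!')
        self.launch_command = 'prun'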
Example #36
    def _create_master_entry (self, url, session, prompt, logger, posix) :
        # FIXME: cache 'which' results, etc
        # FIXME: check 'which' results

        with self.rlock :


            info = {'posix' : posix}

            # get and evaluate session config
            if  not session :
                session = saga.Session (default=True)

            session_cfg = session.get_config ('saga.utils.pty')
            info['ssh_copy_mode']  = session_cfg['ssh_copy_mode'].get_value ()
            info['ssh_share_mode'] = session_cfg['ssh_share_mode'].get_value ()

            logger.info ("ssh copy  mode set to '%s'" % info['ssh_copy_mode' ])
            logger.info ("ssh share mode set to '%s'" % info['ssh_share_mode'])


            # fill the info dict with details for this master channel, and all
            # related future slave channels
            info['schema']    = url.schema.lower ()
            info['host_str']  = url.host
            info['prompt']    = prompt
            info['logger']    = logger
            info['url']       = url
            info['pass']      = ""
            info['key_pass']  = {}
            info['scripts']   = _SCRIPTS

            if  not info['schema'] :
                info['schema'] = 'local'


            # find out what type of shell we have to deal with
            if  info['schema'] in _SCHEMAS_SSH :
                info['shell_type'] = "ssh"
                info['copy_mode']  = info['ssh_copy_mode']
                info['share_mode'] = info['ssh_share_mode']
                info['ssh_exe']    = ru.which ("ssh")
                info['scp_exe']    = ru.which ("scp")
                info['sftp_exe']   = ru.which ("sftp")

            elif info['schema'] in _SCHEMAS_GSI :
                info['shell_type'] = "ssh"
                info['copy_mode']  = info['ssh_copy_mode']
                info['share_mode'] = info['ssh_share_mode']
                info['ssh_exe']    = ru.which ("gsissh")
                info['scp_exe']    = ru.which ("gsiscp")
                info['sftp_exe']   = ru.which ("gsisftp")

            elif info['schema'] in _SCHEMAS_SH :
                info['shell_type'] = "sh"
                info['copy_mode']  = "sh"
                info['share_mode'] = "auto"
                info['sh_args']    = "-i"
                info['sh_env']     = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
                info['cp_env']     = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
                info['scp_root']   = "/"

                if  "SHELL" in os.environ :
                    info['sh_exe'] =  ru.which (os.environ["SHELL"])
                    info['cp_exe'] =  ru.which ("cp")
                else :
                    info['sh_exe'] =  ru.which ("sh")
                    info['cp_exe'] =  ru.which ("cp")

            else :
                raise se.BadParameter._log (self.logger, \
                          "cannot handle schema '%s://'" % url.schema)


            # depending on type, create command line (args, env etc)
            #
            # We always set term=vt100 to avoid ansi-escape sequences in the prompt
            # and elsewhere.  Also, we have to make sure that the shell is an
            # interactive login shell, so that it interprets the users startup
            # files, and reacts on commands.

            try :
                info['latency'] = sumisc.get_host_latency (url)

                # FIXME: note that get_host_latency is considered broken (see
                # saga/utils/misc.py line 73), and will return a constant 250ms.

            except Exception  as e :
                info['latency'] = 1.0  # generic value assuming slow link
                info['logger'].warning ("Could not contact host '%s': %s" % (url, e))

            if  info['shell_type'] == "sh" :

                info['sh_env'] = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes

                if not sumisc.host_is_local (url.host) :
                    raise se.BadParameter._log (self.logger, \
                            "expect local host for '%s://', not '%s'" % (url.schema, url.host))

                if  'user' in info and info['user'] :
                    pass
                else :
                    info['user'] = getpass.getuser ()

            else :
                info['ssh_env']   =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['scp_env']   =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['sftp_env']  =  "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['ssh_args']  =  "-t "                       # force pty
                info['scp_args']  =  _SCP_FLAGS
                info['sftp_args'] =  _SFTP_FLAGS

                if  session :

                    for context in session.contexts :

                        # ssh can also handle UserPass contexts, and ssh type contexts.
                        # gsissh can handle the same, but also X509 contexts.

                        if  context.type.lower () == "ssh" :
                            if  info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI :

                                if  context.attribute_exists ("user_id") and context.user_id :
                                    info['user']  = context.user_id

                                if  context.attribute_exists ("user_key")  and  context.user_key  :
                                    info['ssh_args']  += "-o IdentityFile=%s " % context.user_key
                                    info['scp_args']  += "-o IdentityFile=%s " % context.user_key
                                    info['sftp_args'] += "-o IdentityFile=%s " % context.user_key

                                    if  context.attribute_exists ("user_pass") and context.user_pass :
                                        info['key_pass'][context.user_key] = context.user_pass

                        if  context.type.lower () == "userpass" :
                            if  info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI :
                                if  context.attribute_exists ("user_id") and context.user_id :
                                    info['user']       = context.user_id
                                if  context.attribute_exists ("user_pass") and context.user_pass :
                                    info['pass']       = context.user_pass

                        if  context.type.lower () == "x509" :
                            if  info['schema'] in _SCHEMAS_GSI :

                                if  context.attribute_exists ("user_proxy")  and   context.user_proxy :
                                    info['ssh_env']   += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['scp_env']   += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['sftp_env']  += "X509_USER_PROXY='%s' " % context.user_proxy

                                if  context.attribute_exists ("user_cert")   and  context.user_cert :
                                    info['ssh_env']   += "X509_USER_CERT='%s' " % context.user_cert
                                    info['scp_env']   += "X509_USER_CERT='%s' " % context.user_cert
                                    info['sftp_env']  += "X509_USER_CERT='%s' " % context.user_cert

                                if  context.attribute_exists ("user_key")    and  context.user_key :
                                    info['ssh_env']   += "X509_USER_key='%s' "  % context.user_key
                                    info['scp_env']   += "X509_USER_key='%s' "  % context.user_key
                                    info['sftp_env']  += "X509_USER_key='%s' "  % context.user_key

                                if  context.attribute_exists ("cert_repository") and context.cert_repository :
                                    info['ssh_env']   += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['scp_env']   += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['sftp_env']  += "X509_CERT_DIR='%s' "  % context.cert_repository

                if url.port and url.port != -1 :
                    info['ssh_args']  += "-p %d " % int(url.port)
                    info['scp_args']  += "-p %d " % int(url.port)
                    info['sftp_args'] += "-P %d " % int(url.port)


                # all ssh based shells allow for user_id and user_pass from contexts
                # -- but the data given in the URL take precedence

                if url.username   :  info['user'] = url.username
                if url.password   :  info['pass'] = url.password

                ctrl_user = pwd.getpwuid (os.getuid ()).pw_name
                ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user


                if  'user' in info and info['user'] :
                    info['host_str'] = "%s@%s"  % (info['user'], info['host_str'])
                    info['ctrl'] = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, info['user'])
                else :
                    info['user'] = getpass.getuser ()
                    info['ctrl'] = "%s_%%h_%%p.ctrl" % (ctrl_base)

                info['m_flags']  = _SSH_FLAGS_MASTER % ({'share_mode' : info['share_mode'],
                                                         'ctrl'       : info['ctrl']})
                info['s_flags']  = _SSH_FLAGS_SLAVE  % ({'share_mode' : info['share_mode'],
                                                         'ctrl'       : info['ctrl']})

                # we want the userauth and hostname parts of the URL, to get the
                # scp-scope fs root.
                info['scp_root']  = ""
                has_auth          = False
                if  url.username :
                    info['scp_root'] += url.username
                    has_auth          = True
                if  url.password :
                    info['scp_root'] += ":"
                    info['scp_root'] += url.password
                    has_auth          = True
                if  has_auth :
                    info['scp_root'] += "@"
                info['scp_root']     += "%s:" % url.host

                # FIXME: port needs to be handled as parameter
              # if  url.port :
              #     info['scp_root'] += ":%d" % url.port


            # keep all collected info in the master dict, and return it for
            # registration
            return info
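For illustration, the two flag templates above typically expand to OpenSSH connection-sharing options. A minimal sketch with plausible values -- the actual _SSH_FLAGS_MASTER / _SSH_FLAGS_SLAVE constants are defined elsewhere in the module and may differ:

    # Assumed template values -- illustrative only, not the actual constants:
    _SSH_FLAGS_MASTER = "-o ControlMaster=%(share_mode)s -o ControlPath=%(ctrl)s"
    _SSH_FLAGS_SLAVE  = "-o ControlMaster=no -o ControlPath=%(ctrl)s"

    # The master channel creates the control socket; slave channels (scp,
    # sftp, auxiliary shells) then reuse the established connection.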
Example #37
    def rm_config_hook(cls, name, cfg, rm, log, profiler):

        prte = ru.which('prte')
        if not prte:
            raise Exception("Couldn't find prte")

        # Now that we found prte, query the PRRTE version
        out, _, _ = ru.sh_callout('prte_info | grep "Open RTE"', shell=True)
        prte_info = dict()
        for line in out.split('\n'):

            line = line.strip()

            if 'Open RTE:' in line:
                prte_info['version'] = line.split(':')[1].strip()

            elif 'Open RTE repo revision:' in line:
                prte_info['version_detail'] = line.split(':')[1].strip()

        log.info("Found Open RTE: %s [%s]", prte_info.get('version'),
                 prte_info.get('version_detail'))

        # write hosts file
        furi = '%s/prrte.uri' % os.getcwd()
        fhosts = '%s/prrte.hosts' % os.getcwd()
        vm_size = len(rm.node_list)

        with open(fhosts, 'w') as fout:
            for node in rm.node_list:
                fout.write('%s slots=%d\n' %
                           (node[0], rm.cores_per_node * rm.smt))

        pre = os.environ['PRRTE_PREFIX']
        prte += ' --prefix %s' % pre
        prte += ' --report-uri %s' % furi
        prte += ' --hostfile %s' % fhosts

        if profiler.enabled:
            prte += ' --pmca orte_state_base_verbose 1'  # prte profiling

        # large tasks imply large message sizes, and we need to account for that
        # FIXME: we should derive the message size from the DVM size - smaller
        #        DVMs will never need large messages, as they can't run large tasks
        prte += ' --pmca ptl_base_max_msg_size %d' % (1024 * 1024 * 1024 * 1)
        # prte += ' --pmca rmaps_base_verbose 5'

        # debug mapper problems for large tasks
        if log.isEnabledFor(logging.DEBUG):
            prte += ' --pmca orte_rmaps_base_verbose 100'

        # we apply two temporary tweaks on Summit which should not be needed in
        # the long run:
        #
        # avoid 64 node limit (ssh connection limit)
        prte += ' --pmca plm_rsh_no_tree_spawn 1'

        # ensure 1 ssh per dvm
        prte += ' --pmca plm_rsh_num_concurrent %d' % vm_size

        # Use (g)stdbuf to disable buffering.  We need this to get the
        # "DVM ready" message to ensure DVM startup completion
        #
        # The command seems to be generally available on our Crays;
        # if not, we can code some home-cooked pty stuff (TODO)
        stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
        if not stdbuf_cmd:
            raise Exception("Couldn't find (g)stdbuf")
        stdbuf_arg = "-oL"

        # Base command = (g)stdbuf <args> + prte + prte-args + debug_args
        cmdline = '%s %s %s ' % (stdbuf_cmd, stdbuf_arg, prte)
        # cmdline   = prte

        # Additional (debug) arguments to prte
        verbose = bool(os.environ.get('RADICAL_PILOT_PRUN_VERBOSE'))
        if verbose:
            debug_strings = [
                '--debug-devel',
                '--pmca odls_base_verbose 100',
                '--pmca rml_base_verbose 100',
            ]
        else:
            debug_strings = []

        # Split up the debug strings into args and add them to the cmdline
        cmdline += ' '.join(debug_strings)
        cmdline = cmdline.strip()

        log.info("Start prte on %d nodes [%s]", vm_size, cmdline)
        profiler.prof(event='dvm_start', uid=cfg['pid'])

        dvm_uri = None
        dvm_process = mp.Popen(cmdline.split(),
                               stdout=mp.PIPE,
                               stderr=mp.STDOUT)

        # ----------------------------------------------------------------------
        def _watch_dvm():

            log.info('starting prte watcher')

            retval = dvm_process.poll()
            while retval is None:
                line = dvm_process.stdout.readline().strip()
                if line:
                    log.debug('prte output: %s', line)
                else:
                    time.sleep(1.0)

            if retval != 0:
                # send a kill signal to the main thread.
                # We know that Python and threading are likely not to play well
                # with signals - but this is an exceptional case, and not part
                # of the standard termination sequence.  If the signal is
                # swallowed, the next `prun` call will trigger
                # termination anyway.
                os.kill(os.getpid(), signal.SIGKILL)  # requires `import signal`
                raise RuntimeError('PRTE DVM died')

            log.info('prte stopped (%d)' % dvm_process.returncode)

        # ----------------------------------------------------------------------

        dvm_watcher = mt.Thread(target=_watch_dvm)
        dvm_watcher.daemon = True
        dvm_watcher.start()

        for _ in range(100):

            time.sleep(0.5)
            try:
                with open(furi, 'r') as fin:
                    for line in fin.readlines():
                        if '://' in line:
                            dvm_uri = line.strip()
                            break

            except Exception as e:
                log.debug('DVM check: uri file missing: %s...' % str(e)[:24])
                time.sleep(0.5)

            if dvm_uri:
                break

        if not dvm_uri:
            raise Exception("VMURI not found!")

        log.info("prte startup successful: [%s]", dvm_uri)

        # in some cases, the DVM seems to need some additional time to settle.
        # FIXME: this should not be needed, really
        time.sleep(10)
        profiler.prof(event='dvm_ok', uid=cfg['pid'])

        lm_info = {
            'dvm_uri': dvm_uri,
            'version_info': prte_info,
            'cvd_id_mode': 'physical'
        }

        # we need to inform the actual LaunchMethod instance about the prte URI.
        # So we pass it back to the ResourceManager which will keep it in an
        # 'lm_info', which will then be passed as part of the slots via the
        # scheduler
        return lm_info
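The returned lm_info['dvm_uri'] is what later task launches use to address the running DVM. A minimal sketch of such a consumer, assuming prun's `--hnp <uri>` addressing; the helper, task command, and process count are illustrative:

    # Hypothetical consumer of lm_info -- not the actual LaunchMethod code:
    def construct_prun_command(task_exe, n_procs, lm_info):
        # 'prun --hnp <uri>' submits against an already-running PRRTE DVM
        return 'prun --hnp "%s" -np %d %s' % (lm_info['dvm_uri'], n_procs, task_exe)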
Example #38
    def _configure(self):

        self.launch_command = ru.which('jsrun')
        assert(self.launch_command)
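jsrun expresses placement in terms of resource sets rather than plain process counts. A hedged sketch of assembling one invocation -- flag semantics follow IBM's jsrun documentation, while the helper and its parameters are assumptions:

    # Illustrative only: -n resource sets, -a tasks per set, -c CPUs per set,
    # -g GPUs per set.
    def construct_jsrun_command(task_exe, n_rs, tasks_per_rs, cpus_per_rs, gpus_per_rs):
        return 'jsrun -n %d -a %d -c %d -g %d %s' % (
            n_rs, tasks_per_rs, cpus_per_rs, gpus_per_rs, task_exe)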
Example #39
    def lrms_config_hook(cls, name, cfg, lrms, logger, profiler):
        """
        FIXME: this config hook will manipulate the LRMS nodelist.  Not a nice
               thing to do, but hey... :P
               What really should be happening is that the LRMS digs information
               on node reservation out of the config and configures the node
               list accordingly.  This config hook should be limited to starting
               the DVM.
        """

        dvm_command = ru.which('orte-dvm')
        if not dvm_command:
            raise Exception("Couldn't find orte-dvm")

        # Now that we found the orte-dvm, get ORTE version
        out, err, ret = ru.sh_callout('orte-info | grep "Open RTE"', shell=True)
        orte_info = dict()
        for line in out.split('\n'):

            line = line.strip()
            if not line:
                continue

            key, val = line.split(':', 1)
            if 'Open RTE' == key.strip():
                orte_info['version'] = val.strip()
            elif  'Open RTE repo revision' == key.strip():
                orte_info['version_detail'] = val.strip()

        assert(orte_info.get('version'))
        logger.info("Found Open RTE: %s / %s",
                    orte_info['version'], orte_info.get('version_detail'))

        # Use (g)stdbuf to disable buffering.  We need this to catch the
        # "DVM ready" message without waiting for orte-dvm to complete.
        # The command seems to be generally available on our Crays;
        # if not, we can code some home-cooked pty stuff.
        stdbuf_cmd =  ru.which(['stdbuf', 'gstdbuf'])
        if not stdbuf_cmd:
            raise Exception("Couldn't find (g)stdbuf")
        stdbuf_arg = "-oL"

        # Base command = (g)stdbuf <args> + orte-dvm + debug_args
        dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

        # Additional (debug) arguments to orte-dvm
        if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'):
            debug_strings = [
                             '--debug-devel',
                             '--mca odls_base_verbose 100',
                             '--mca rml_base_verbose 100'
                            ]
        else:
            debug_strings = []

        # Split up the debug strings into args and add them to the dvm_args
        for ds in debug_strings:
            dvm_args.extend(ds.split())

        vm_size = len(lrms.node_list)
        logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args))
        profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id'])

        dvm_uri     = None
        dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT)

        while True:

            line = dvm_process.stdout.readline().strip()

            if line.startswith('VMURI:'):

                if len(line.split(' ')) != 2:
                    raise Exception("Unknown VMURI format: %s" % line)

                label, dvm_uri = line.split(' ', 1)

                if label != 'VMURI:':
                    raise Exception("Unknown VMURI format: %s" % line)

                logger.info("ORTE DVM URI: %s" % dvm_uri)

            elif line == 'DVM ready':

                if not dvm_uri:
                    raise Exception("VMURI not found!")

                logger.info("ORTE DVM startup successful!")
                profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id'])
                break

            else:

                # Check if the process is still around,
                # and log output in debug mode.
                if dvm_process.poll() is None:
                    logger.debug("ORTE: %s", line)
                else:
                    # Process is gone: fatal!
                    raise Exception("ORTE DVM process disappeared")
                    profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])


        # ----------------------------------------------------------------------
        def _watch_dvm():

            logger.info('starting DVM watcher')

            retval = dvm_process.poll()
            while retval is None:
                line = dvm_process.stdout.readline().strip()
                if line:
                    logger.debug('dvm output: %s', line)
                else:
                    time.sleep(1.0)

            if retval != 0:
                # send a kill signal to the main thread.
                # We know that Python and threading are likely not to play well
                # with signals - but this is an exceptional case, and not part
                # of the standard termination sequence.  If the signal is
                # swallowed, the next `orte-submit` call will trigger
                # termination anyway.
                os.kill(os.getpid(), signal.SIGKILL)  # requires `import signal`

            logger.info('DVM stopped (%d)' % dvm_process.returncode)
        # ----------------------------------------------------------------------

        dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
        dvm_watcher.start()

        lm_info = {'dvm_uri'     : dvm_uri,
                   'version_info': {name: orte_info}}

        # we need to inform the actual LM instance about the DVM URI.  So we
        # pass it back to the LRMS which will keep it in an 'lm_info', which
        # will then be passed as part of the slots via the scheduler
        return lm_info
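As in the PRRTE variant, the dvm_uri stored in lm_info is consumed by later submissions. A minimal sketch, assuming orte-submit's `--hnp <uri>` addressing; the helper itself is illustrative:

    # Hypothetical consumer of lm_info -- not the actual LaunchMethod code:
    def construct_orte_submit_command(task_exe, n_procs, lm_info):
        return 'orte-submit --hnp "%s" -np %d %s' % (
            lm_info['dvm_uri'], n_procs, task_exe)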
Example #40
    def _configure(self):

        # Find rsh command
        self.launch_command = ru.which('rsh')
Example #41
    def _configure(self):

        self.launch_command = ru.which('orterun')

        # Request to create a background asynchronous event loop
        os.putenv("OMPI_MCA_ess_tool_async_progress", "enabled")
Example #42
    def lrms_config_hook(cls, name, cfg, lrms, logger, profile):
        """
        FIXME: this config hook will inspect the LRMS nodelist and, if needed,
               will start the YARN cluster on node[0].
        """
        logger.info('Hook called by YARN LRMS with the name %s' % lrms.name)

        def config_core_site(node):

            core_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/core-site.xml', 'r')
            lines = core_site_file.readlines()
            core_site_file.close()

            prop_str = '<property>\n'
            prop_str += '  <name>fs.default.name</name>\n'
            prop_str += '    <value>hdfs://%s:54170</value>\n' % node
            prop_str += '</property>\n'

            lines.insert(-1, prop_str)

            core_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/core-site.xml', 'w')
            for line in lines:
                core_site_file.write(line)
            core_site_file.close()

        def config_hdfs_site(nodes):

            hdfs_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/hdfs-site.xml', 'r')
            lines = hdfs_site_file.readlines()
            hdfs_site_file.close()

            prop_str = '<property>\n'
            prop_str += ' <name>dfs.replication</name>\n'
            prop_str += ' <value>1</value>\n'
            prop_str += '</property>\n'

            prop_str += '<property>\n'
            prop_str += '  <name>dfs.name.dir</name>\n'
            prop_str += '    <value>file:///tmp/hadoop/hadoopdata/hdfs/namenode</value>\n'
            prop_str += '</property>\n'

            prop_str += '<property>\n'
            prop_str += '  <name>dfs.data.dir</name>\n'
            prop_str += '    <value>file:///tmp/hadoop/hadoopdata/hdfs/datanode</value>\n'
            prop_str += '</property>\n'

            lines.insert(-1, prop_str)

            hdfs_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/hdfs-site.xml', 'w')
            for line in lines:
                hdfs_site_file.write(line)
            hdfs_site_file.close()

        def config_mapred_site():

            mapred_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/mapred-site.xml.template',
                'r')
            lines = mapred_site_file.readlines()
            mapred_site_file.close()

            prop_str = ' <property>\n'
            prop_str += '  <name>mapreduce.framework.name</name>\n'
            prop_str += '   <value>yarn</value>\n'
            prop_str += ' </property>\n'

            lines.insert(-1, prop_str)

            mapred_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/mapred-site.xml', 'w')
            for line in lines:
                mapred_site_file.write(line)
            mapred_site_file.close()

        def config_yarn_site(cores, nodelist, hostname):

            yarn_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/yarn-site.xml', 'r')
            lines = yarn_site_file.readlines()
            yarn_site_file.close()

            total_mem_str = subprocess.check_output(
                ['grep', 'MemTotal', '/proc/meminfo'])
            # MemTotal is reported in kB -- convert to MB
            total_free_mem = int(total_mem_str.split()[1]) / 1024

            if len(nodelist) == 1:
                cores_used = cores / 2
                total_mem = total_free_mem * 0.75
            else:
                cores_used = cores * (len(nodelist) - 1)
                total_mem = total_free_mem * (len(nodelist) - 1)
                slaves = open(os.getcwd() + '/hadoop/etc/hadoop/slaves', 'w')
                for node in nodelist[1:]:
                    slaves.write('%s\n' % (node + hostname))
                slaves.close()
                master = open(os.getcwd() + '/hadoop/etc/hadoop/masters', 'w')
                master.write('%s\n' % (nodelist[0] + hostname))
                master.close()

            max_app_mem = total_mem / cores_used

            prop_str = ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.aux-services</name>\n'
            prop_str += '    <value>mapreduce_shuffle</value>\n'
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.scheduler.maximum-allocation-mb</name>\n'
            prop_str += '   <value>%d</value>\n' % max_app_mem
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.resourcemanager.hostname</name>\n'
            prop_str += '   <value>%s</value>\n' % (nodelist[0] + hostname)
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.resource.cpu-vcores</name>\n'
            prop_str += '   <value>%d</value>\n' % cores_used
            prop_str += ' </property>\n'

            prop_str += ' <property>\n'
            prop_str += '  <name>yarn.nodemanager.resource.memory-mb</name>\n'
            prop_str += '   <value>%d</value>\n' % total_mem
            prop_str += ' </property>\n'

            lines.insert(-1, prop_str)

            yarn_site_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/yarn-site.xml', 'w')
            for line in lines:
                yarn_site_file.write(line)
            yarn_site_file.close()

            scheduler_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/capacity-scheduler.xml', 'r')
            lines = scheduler_file.readlines()
            scheduler_file.close()

            for line in lines:
                if line.startswith(
                        '    <value>org.apache.hadoop.yarn.util.resource.'):
                    new_line = ('    <value>org.apache.hadoop.yarn.util.'
                                'resource.DefaultResourceCalculator</value>\n')
                    lines[lines.index(line)] = new_line
                elif line.startswith(
                        '    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>'
                ):
                    new_line = '    <value>1</value>\n'
                    lines[lines.index(line) + 1] = new_line

            scheduler_file = open(
                os.getcwd() + '/hadoop/etc/hadoop/capacity-scheduler.xml', 'w')
            for line in lines:
                scheduler_file.write(line)

            scheduler_file.close()

        # If the LRMS used is not YARN, the namenode URL is the first node in
        # the list with the default port; otherwise it is the one the YARN
        # LRMS returns.
        hadoop_home = None
        if lrms.name == 'YARNLRMS':  # FIXME: use constant
            logger.info('Hook called by YARN LRMS')
            logger.info('NameNode: %s', lrms.namenode_url)
            service_url = lrms.namenode_url
            rm_url = "%s:%s" % (lrms.rm_ip, lrms.rm_port)
            rm_ip = lrms.rm_ip
            launch_command = ru.which('yarn')

        else:
            # Here are the necessary commands to start the cluster.
            if lrms.node_list[0] == 'localhost':
                #Download the tar file
                node_name = lrms.node_list[0]
                stat = os.system(
                    "wget http://apache.claz.org/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz"
                )
                stat = os.system(
                    'tar xzf hadoop-2.6.5.tar.gz;mv hadoop-2.6.5 hadoop;rm -rf hadoop-2.6.5.tar.gz'
                )
            else:
                node = subprocess.check_output('/bin/hostname')
                logger.info('Entered Else creation')
                node_name = node.split('\n')[0]
                stat = os.system(
                    "wget http://apache.claz.org/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz"
                )
                stat = os.system(
                    'tar xzf hadoop-2.6.5.tar.gz;mv hadoop-2.6.5 hadoop;rm -rf hadoop-2.6.5.tar.gz'
                )
                # TODO: Decide how the agent will get Hadoop tar ball.

            hadoop_home = os.getcwd() + '/hadoop'
            hadoop_install = hadoop_home
            hadoop_mapred_home = hadoop_home
            hadoop_common_home = hadoop_home
            hadoop_hdfs_home = hadoop_home
            yarn_home = hadoop_home

            hadoop_common_lib_native_dir = hadoop_home + '/lib/native'

            #-------------------------------------------------------------------
            # Solution to find Java's home folder:
            # http://stackoverflow.com/questions/1117398/java-home-directory

            java = ru.which('java')
            if java != '/usr/bin/java':
                jpos = java.split('bin')
            else:
                jpos = os.path.realpath('/usr/bin/java').split('bin')

            if jpos[0].find('jre') != -1:
                java_home = jpos[0][:jpos[0].find('jre')]
            else:
                java_home = jpos[0]

            hadoop_env_file = open(hadoop_home + '/etc/hadoop/hadoop-env.sh',
                                   'r')
            hadoop_env_file_lines = hadoop_env_file.readlines()
            hadoop_env_file.close()
            hadoop_env_file_lines[24] = 'export JAVA_HOME=%s\n' % java_home
            hadoop_env_file = open(hadoop_home + '/etc/hadoop/hadoop-env.sh',
                                   'w')
            for line in hadoop_env_file_lines:
                hadoop_env_file.write(line)
            hadoop_env_file.close()
            host = node_name.split(lrms.node_list[0])[1]

            config_core_site(node_name)
            config_hdfs_site(lrms.node_list)
            config_mapred_site()
            config_yarn_site(lrms.cores_per_node, lrms.node_list, host)

            logger.info('Start Formatting DFS')
            namenode_format = os.system(hadoop_home +
                                        '/bin/hdfs namenode -format -force')
            logger.info('DFS Formatted. Starting DFS.')
            logger.info('Starting YARN')
            yarn_start = subprocess.check_output(
                [hadoop_home + '/sbin/start-all.sh'])
            if 'Error' in yarn_start:
                raise RuntimeError('Unable to start YARN cluster: %s' \
                    % (yarn_start))
            else:
                logger.info('Started YARN')

            #-------------------------------------------------------------------
            # Creating user's HDFS home folder
            logger.debug('Running: %s/bin/hdfs dfs -mkdir /user' % hadoop_home)
            os.system('%s/bin/hdfs dfs -mkdir /user' % hadoop_home)
            uname = subprocess.check_output('whoami').split('\n')[0]
            logger.debug('Running: %s/bin/hdfs dfs -mkdir /user/%s' %
                         (hadoop_home, uname))
            os.system('%s/bin/hdfs dfs -mkdir /user/%s' % (hadoop_home, uname))
            check = subprocess.check_output(
                ['%s/bin/hdfs' % hadoop_home, 'dfs', '-ls', '/user'])
            logger.info(check)
            logger.info('Getting YARN app')
            os.system(
                'wget https://www.dropbox.com/s/9yxbj9btibgtg40/Pilot-YARN-0.1-jar-with-dependencies.jar'
            )

            # FIXME YARN: why was the scheduler configure called here?  Configure
            #             is already called during scheduler instantiation
            # self._scheduler._configure()

            service_url = node_name + ':54170'
            rm_url = node_name
            launch_command = yarn_home + '/bin/yarn'
            rm_ip = node_name

        # The LRMS instance is only available here -- everything which is later
        # needed by the scheduler or launch method is stored in an 'lm_info'
        # dict.  That lm_info dict will be attached to the scheduler's lrms_info
        # dict, and will be passed around as part of the opaque_slots structure,
        # so it is available on all LM create_command calls.
        lm_info = {
            'service_url': service_url,
            'rm_url': rm_url,
            'hadoop_home': hadoop_home,
            'rm_ip': rm_ip,
            'name': lrms.name,
            'launch_command': launch_command,
            'nodename': lrms.node_list[0]
        }

        return lm_info
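All config_* helpers above splice their XML in with `lines.insert(-1, prop_str)`, which silently assumes the closing </configuration> tag is the very last line of each file. A slightly more defensive sketch of the same idea (this helper is hypothetical, not part of the original hook):

    def insert_properties(path, prop_str):
        # insert the property block just before '</configuration>' instead
        # of assuming that tag is the very last line of the file
        with open(path, 'r') as f:
            lines = f.readlines()
        for idx, line in enumerate(lines):
            if '</configuration>' in line:
                lines.insert(idx, prop_str)
                break
        with open(path, 'w') as f:
            f.writelines(lines)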
Example #43
    def _create_master_entry(self, url, session, prompt, logger):
        # FIXME: cache 'which' results, etc
        # FIXME: check 'which' results

        with self.rlock:
            # if True :

            info = {}

            info['schema'] = url.schema.lower()
            info['host_str'] = url.host
            info['prompt'] = prompt
            info['logger'] = logger
            info['url'] = url
            info['pass'] = ""
            info['key_pass'] = {}
            info['scripts'] = _SCRIPTS

            if not info['schema']:
                info['schema'] = 'local'

            # find out what type of shell we have to deal with
            if info['schema'] in _SCHEMAS_SSH:
                info['type'] = "ssh"
                info['ssh_exe'] = ru.which("ssh")
                info['scp_exe'] = ru.which("scp")
                info['sftp_exe'] = ru.which("sftp")

            elif info['schema'] in _SCHEMAS_GSI:
                info['type'] = "ssh"
                info['ssh_exe'] = ru.which("gsissh")
                info['scp_exe'] = ru.which("gsiscp")
                info['sftp_exe'] = ru.which("gsisftp")

            elif info['schema'] in _SCHEMAS_SH:
                info['type'] = "sh"
                info['sh_args'] = "-i"
                info['sh_env'] = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
                info['cp_env'] = "/usr/bin/env TERM=vt100 PS1='PROMPT-$?->'"
                info['fs_root'] = "/"

                if "SHELL" in os.environ:
                    info['sh_exe'] = ru.which(os.environ["SHELL"])
                    info['cp_exe'] = ru.which("cp")
                else:
                    info['sh_exe'] = ru.which("sh")
                    info['cp_exe'] = ru.which("cp")

            else:
                raise se.BadParameter._log (self.logger, \
                          "cannot handle schema '%s://'" % url.schema)

            # depending on type, create command line (args, env etc)
            #
            # We always set term=vt100 to avoid ansi-escape sequences in the prompt
            # and elsewhere.  Also, we have to make sure that the shell is an
            # interactive login shell, so that it interprets the users startup
            # files, and reacts on commands.

            try:
                info['latency'] = sumisc.get_host_latency(url)

                # FIXME: note that get_host_latency is considered broken (see
                # saga/utils/misc.py line 73), and will return a constant 250ms.

            except Exception as e:
                info['latency'] = 1.0  # generic value assuming slow link
                info['logger'].warning("Could not contact host '%s': %s" %
                                       (url, e))

            if info['type'] == "sh":

                info['sh_env'] = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes

                if not sumisc.host_is_local(url.host):
                    raise se.BadParameter._log (self.logger, \
                            "expect local host for '%s://', not '%s'" % (url.schema, url.host))

                if not info.get('user'):
                    info['user'] = getpass.getuser()

            else:
                info['ssh_env']  = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['scp_env']  = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['sftp_env'] = "/usr/bin/env TERM=vt100 "  # avoid ansi escapes
                info['ssh_args'] = "-t "  # force pty
                info['scp_args'] = ""
                info['sftp_args'] = ""

                if session:

                    for context in session.contexts:

                        # ssh can also handle UserPass contexts, and ssh type contexts.
                        # gsissh can handle the same, but also X509 contexts.

                        if context.type.lower() == "ssh":
                            if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI:

                                if context.attribute_exists("user_id") and context.user_id:
                                    info['user'] = context.user_id

                                if context.attribute_exists("user_key") and context.user_key:
                                    info['ssh_args']  += "-o IdentityFile=%s " % context.user_key
                                    info['scp_args']  += "-o IdentityFile=%s " % context.user_key
                                    info['sftp_args'] += "-o IdentityFile=%s " % context.user_key

                                    if context.attribute_exists("user_pass") and context.user_pass:
                                        info['key_pass'][context.user_key] = context.user_pass

                        if context.type.lower() == "userpass":
                            if info['schema'] in _SCHEMAS_SSH + _SCHEMAS_GSI:
                                if context.attribute_exists("user_id") and context.user_id:
                                    info['user'] = context.user_id
                                if context.attribute_exists("user_pass") and context.user_pass:
                                    info['pass'] = context.user_pass

                        if context.type.lower() == "x509":
                            if info['schema'] in _SCHEMAS_GSI:

                                if context.attribute_exists("user_proxy") and context.user_proxy:
                                    info['ssh_env']  += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['scp_env']  += "X509_USER_PROXY='%s' " % context.user_proxy
                                    info['sftp_env'] += "X509_USER_PROXY='%s' " % context.user_proxy

                                if context.attribute_exists("user_cert") and context.user_cert:
                                    info['ssh_env']  += "X509_USER_CERT='%s' " % context.user_cert
                                    info['scp_env']  += "X509_USER_CERT='%s' " % context.user_cert
                                    info['sftp_env'] += "X509_USER_CERT='%s' " % context.user_cert

                                if context.attribute_exists("user_key") and context.user_key:
                                    info['ssh_env']  += "X509_USER_KEY='%s' "  % context.user_key
                                    info['scp_env']  += "X509_USER_KEY='%s' "  % context.user_key
                                    info['sftp_env'] += "X509_USER_KEY='%s' "  % context.user_key

                                if context.attribute_exists("cert_repository") and context.cert_repository:
                                    info['ssh_env']  += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['scp_env']  += "X509_CERT_DIR='%s' "  % context.cert_repository
                                    info['sftp_env'] += "X509_CERT_DIR='%s' "  % context.cert_repository

                if url.port and url.port != -1:
                    info['ssh_args'] += "-p %d " % int(url.port)
                    info['scp_args'] += "-p %d " % int(url.port)
                    info['sftp_args'] += "-P %d " % int(url.port)

                # all ssh based shells allow for user_id and user_pass from contexts
                # -- but the data given in the URL take precedence

                if url.username: info['user'] = url.username
                if url.password: info['pass'] = url.password

                ctrl_user = pwd.getpwuid(os.getuid()).pw_name
                ctrl_base = "/tmp/saga_ssh_%s" % ctrl_user

                if 'user' in info and info['user']:
                    info['host_str'] = "%s@%s" % (info['user'], info['host_str'])
                    info['ctrl']     = "%s_%%h_%%p.%s.ctrl" % (ctrl_base, info['user'])
                else:
                    info['user'] = getpass.getuser()
                    info['ctrl'] = "%s_%%h_%%p.ctrl" % (ctrl_base)

                info['m_flags'] = _SSH_FLAGS_MASTER % ({'ctrl': info['ctrl']})
                info['s_flags'] = _SSH_FLAGS_SLAVE % ({'ctrl': info['ctrl']})
                info['fs_root']      = url
                info['fs_root'].path = "/"

            # keep all collected info in the master dict, and return it for
            # registration
            return info
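A hedged sketch of how the collected info dict composes into the actual master shell command -- the concatenation order mirrors the fields set above, but the helper itself is illustrative:

    def build_master_command(info):
        # env prefix + executable + pty/port/identity args + connection-sharing
        # flags + user@host, e.g.:
        #   /usr/bin/env TERM=vt100 ssh -t -p 22 <m_flags> user@host
        return "%s %s %s %s %s" % (info['ssh_env'], info['ssh_exe'],
                                   info['ssh_args'], info['m_flags'],
                                   info['host_str'])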
Example #44
    def lrms_config_hook(cls, name, cfg, lrms, logger, profile):

        import radical.utils as ru

        if not os.environ.get('SPARK_HOME'):
            logger.info("Downloading Apache Spark..")
            try:

                VERSION = "2.0.2"
                subprocess.check_call(
                    "wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.2-bin-hadoop2.7.tgz"
                    .split())
                subprocess.check_call(
                    'tar -xzf spark-2.0.2-bin-hadoop2.7.tgz'.split())
                subprocess.check_call(
                    "rm spark-2.0.2-bin-hadoop2.7.tgz ".split())
                subprocess.check_call(
                    ("mv spark-2.0.2-bin-hadoop2.7 spark-" + VERSION).split())
            except Exception as e:
                raise RuntimeError(
                    "Spark wasn't installed properly. Please try again. %s " %
                    e)
            spark_home = os.getcwd() + '/spark-' + VERSION
        else:
            spark_home = os.environ['SPARK_HOME']

        #-------------------------------------------------------------------
        platform_os = sys.platform
        java_home = os.environ.get('JAVA_HOME')

        if platform_os == "linux" or platform_os == "linux2":
            if not java_home:
                java = ru.which('java')
                if java != '/usr/bin/java':
                    jpos = java.split('bin')
                else:
                    jpos = os.path.realpath('/usr/bin/java').split('bin')

                if jpos[0].find('jre') != -1:
                    java_home = jpos[0][:jpos[0].find('jre')]
                else:
                    java_home = jpos[0]
        else:
            if not java_home:
                try:
                    java_home = subprocess.check_output(
                        "/usr/libexec/java_home").split()[0]
                except Exception:
                    java_home = '/Library/Java/Home'

        spark_conf_slaves = open(spark_home + "/conf/slaves", 'w')

        if len(lrms.node_list) == 1:
            spark_conf_slaves.write(lrms.node_list[0])  #+hostname)
            spark_conf_slaves.write('\n')
        else:
            for nodename in lrms.node_list[1:]:
                spark_conf_slaves.write(nodename)  # +hostname)
                spark_conf_slaves.write('\n')

        spark_conf_slaves.close()

        # put the master IP into the spark-env.sh file

        if len(lrms.node_list) == 1:
            master_ip = lrms.node_list[0]
        else:
            try:
                master_ip = subprocess.check_output(
                    'hostname -f'.split()).strip()
            except Exception as e:
                raise RuntimeError("Master ip couldn't be detected. %s" % e)

        # set up default env properties
        spark_default_file = open(spark_home + "/conf/spark-defaults.conf",
                                  'w')
        spark_master_string = 'spark://%s:7077' % master_ip
        spark_default_file.write('spark.master  ' + spark_master_string + '\n')
        spark_default_file.close()
        logger.info("Let's print the config")
        logger.info('Config : {0}'.format(cfg['resource_cfg']))

        spark_env_file = open(spark_home + "/conf/spark-env.sh", 'w')
        # load the machine configuration into the Spark environment of the
        # master and the slaves
        if master_ip != 'localhost':
            for config in cfg['resource_cfg']['pre_bootstrap_1']:
                spark_env_file.write(config + '\n')

        spark_env_file.write('export SPARK_MASTER_HOST=' + master_ip + "\n")
        spark_env_file.write('export JAVA_HOME=' + java_home + "\n")
        spark_env_file.write('export SPARK_LOG_DIR=' + os.getcwd() +
                             '/spark-logs' + '\n')
        #spark_env_file.write('export PYSPARK_PYTHON=`which python` \n')
        spark_env_file.close()

        # start the Spark cluster
        try:
            subprocess.check_output(spark_home + '/sbin/start-all.sh')
        except Exception as e:
            raise RuntimeError("Spark Cluster failed to start: %s" % e)

        logger.info('Started Spark cluster')
        launch_command = spark_home + '/bin'

        # The LRMS instance is only available here -- everything which is later
        # needed by the scheduler or launch method is stored in an 'lm_info'
        # dict.  That lm_info dict will be attached to the scheduler's lrms_info
        # dict, and will be passed around as part of the opaque_slots structure,
        # so it is available on all LM create_command calls.
        lm_info = {
            'spark_home': spark_home,
            'master_ip': master_ip,
            'lm_detail': spark_master_string,
            'name': lrms.name,
            'launch_command': launch_command,
            'nodename': lrms.node_list[0]
        }

        return lm_info
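The launch_command returned here points at Spark's bin/ directory. A minimal sketch of submitting an application against the freshly started cluster; app_py is a placeholder path:

    # Illustrative usage of lm_info -- not part of the original hook:
    app_py = '/path/to/app.py'
    cmd = '%s/spark-submit --master %s %s' % (
        lm_info['launch_command'], lm_info['lm_detail'], app_py)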
Example #45
    def _configure(self):

        # aprun: job launcher for Cray systems
        self.launch_command = ru.which('aprun')
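For context, a hedged sketch of the invocation such a launch method would eventually build -- `-n` (total PEs) and `-N` (PEs per node) are standard aprun flags, while the helper and its parameters are assumptions:

    def construct_aprun_command(launch_command, task_exe, n_procs, procs_per_node):
        # -n: total number of processing elements; -N: PEs per node
        return '%s -n %d -N %d %s' % (launch_command, n_procs,
                                      procs_per_node, task_exe)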
Example #46
    def _configure(self):

        self.launch_command = ru.which('orterun')

        # Request to create a background asynchronous event loop
        os.putenv("OMPI_MCA_ess_tool_async_progress", "enabled")
Example #47
    def _configure(self):

        # dplace: job launcher for SGI systems (e.g. on Blacklight)
        self.launch_command = ru.which('dplace')
Example #48
    def _configure(self):

        # ccmrun: Cluster Compatibility Mode (CCM) job launcher for Cray systems
        self.launch_command = ru.which('ccmrun')
Example #49
    def _configure(self):

        # runjob: job launcher for IBM BG/Q systems, e.g. Joule
        self.launch_command = ru.which('runjob')

        raise NotImplementedError('RUNJOB LM still coupled to scheduler/LRMS')