示例#1
0
 def terminate(self, params):
     """Stop the remote task described by ``params``.

     Opens an SSH connection to ``params['host']``, force-removes the
     docker container when ``params`` carries a ``container_id``, and then
     sends SIGKILL to the process group ``params['pgid']`` if that group
     still exists.

     Args:
         params: dict read for ``host``, ``port``, ``login``,
             ``login_cmd``, ``pgid`` and, optionally, ``container_id``.

     Returns:
         None. Failures of the ``kill`` commands are logged, not raised.
     """
     client = common.ssh_connect_with_retry(
         params['host'],
         params['port'],
         params['login'],
         pkey=self._config.get('pkey'),
         key_filename=(self._config.get('key_filename')
                       or self._config.get('privateKey')),
         login_cmd=params['login_cmd'])
     # try/finally guarantees the connection is released even when one of
     # the remote commands raises instead of returning an exit status.
     try:
         if 'container_id' in params:
             common.run_docker_command(
                 client, 'rm --force %s' % params['container_id'])
             # NOTE(review): presumably leaves docker time to tear the
             # container down before probing the process group - confirm.
             time.sleep(5)

         # Signal 0 only probes: a non-zero status means the process
         # group is already gone (or not signalable), so nothing to kill.
         exit_status, _, stderr = common.run_command(
             client, 'kill -0 -%d' % params['pgid'])
         if exit_status != 0:
             logger.info("exist_status %d: %s", exit_status, stderr.read())
             return

         exit_status, _, stderr = common.run_command(
             client, 'kill -9 -%d' % params['pgid'])
         if exit_status != 0:
             logger.info("exist_status %d: %s", exit_status, stderr.read())
             return

         logger.info("successfully terminated")
     finally:
         client.close()
示例#2
0
    def status(self, task_id, params, get_log=True):
        """Report whether the remote task is still alive.

        Runs ``docker inspect`` on ``params['container_id']`` when present,
        otherwise probes the process group ``params['pgid']`` with
        ``kill -0``. Optionally refreshes the task log through
        ``common.update_log``.

        Args:
            task_id: identifier forwarded to ``common.update_log``.
            params: dict read for ``host``, ``port``, ``login``,
                ``login_cmd``, ``log_dir`` and either ``container_id`` or
                ``pgid``.
            get_log: when true, call ``common.update_log`` before closing
                the connection.

        Returns:
            ``"dead"`` when the probe command exits non-zero, otherwise
            ``"running"``.
        """
        client = common.ssh_connect_with_retry(
            params['host'],
            params['port'],
            params['login'],
            pkey=self._config.get('pkey'),
            key_filename=(self._config.get('key_filename')
                          or self._config.get('privateKey')),
            login_cmd=params['login_cmd'])

        # Release the connection even if the probe or the log update raises.
        try:
            if 'container_id' in params:
                exit_status, _, _ = common.run_docker_command(
                    client,
                    'inspect -f {{.State.Status}} %s' % params['container_id'])
            else:
                exit_status, _, _ = common.run_command(
                    client, 'kill -0 -%d' % params['pgid'])

            if get_log:
                common.update_log(task_id, client, params['log_dir'],
                                  self._config.get('callback_url'))
        finally:
            client.close()

        return "dead" if exit_status != 0 else "running"
示例#3
0
 def terminate(self, params):
     """Stop the remote task described by ``params``.

     Force-removes the docker container when ``params`` has a
     ``container_id``; otherwise sends SIGKILL to the process group
     ``params['pgid']`` if that group still exists.

     Args:
         params: dict read for ``server``, ``login``, ``login_cmd`` and
             either ``container_id`` or ``pgid``.

     Returns:
         None. Failures of the ``kill`` commands are logged at debug
         level, not raised.
     """
     client = common.ssh_connect_with_retry(
         params['server'],
         params['login'],
         self._config['privateKey'],
         login_cmd=params['login_cmd'])
     # try/finally guarantees the connection is released even when a
     # remote command raises.
     try:
         if 'container_id' in params:
             common.run_docker_command(
                 client, 'rm --force %s' % params['container_id'])
         else:
             # Signal 0 only probes whether the process group exists.
             exit_status, _, stderr = common.run_command(
                 client, 'kill -0 -%d' % params['pgid'])
             if exit_status != 0:
                 logger.debug("exist_status %d: %s",
                              exit_status, stderr.read())
                 return

             exit_status, _, stderr = common.run_command(
                 client, 'kill -9 -%d' % params['pgid'])
             if exit_status != 0:
                 logger.debug("exist_status %d: %s",
                              exit_status, stderr.read())
                 return

         logger.debug("successfully terminated")
     finally:
         client.close()
示例#4
0
    def terminate(self, params):
        """Terminate the instance or remote task described by ``params``.

        For a dynamic instance (``params['dynamic']`` truthy) the server is
        deleted through the nova client. Otherwise the task is stopped over
        SSH: the docker container is force-removed when ``container_id`` is
        present, and the process group ``pgid`` is killed when present.

        Args:
            params: dict read for ``instance_id`` and, optionally,
                ``dynamic``, ``container_id`` and ``pgid``; also passed to
                ``get_client`` to open the SSH connection.

        Returns:
            None. When a ``kill`` command fails, its status is logged and
            the method returns without the final "Terminated instance"
            log line.
        """
        instance_id = params["instance_id"]
        if params.get('dynamic'):
            self._nova_client.servers.delete(instance_id)
        else:
            ssh_client = get_client(params, self._config)
            # try/finally guarantees the connection is released on every
            # path, including exceptions from the remote commands.
            try:
                if 'container_id' in params:
                    common.run_docker_command(
                        ssh_client,
                        'rm --force %s' % params['container_id'])
                    # NOTE(review): presumably leaves docker time to tear
                    # the container down before probing - confirm.
                    time.sleep(5)
                if 'pgid' in params:
                    # Signal 0 only probes whether the group still exists.
                    exit_status, _, stderr = common.run_command(
                        ssh_client, 'kill -0 -%d' % params['pgid'])
                    if exit_status != 0:
                        logger.info("exist_status %d: %s",
                                    exit_status, stderr.read())
                        return
                    # Fixed: the original passed the undefined name
                    # `client` here, raising NameError on this path.
                    exit_status, _, stderr = common.run_command(
                        ssh_client, 'kill -9 -%d' % params['pgid'])
                    if exit_status != 0:
                        logger.info("exist_status %d: %s",
                                    exit_status, stderr.read())
                        return
            finally:
                ssh_client.close()

        logger.info("Terminated instance (on terminate): %s.", instance_id)
示例#5
0
    def status(self, task_id, params, get_log=True):  # pylint: disable=arguments-differ
        """Report whether the remote task is still alive.

        Runs ``docker inspect`` on ``params['container_id']`` when present,
        otherwise probes the process group ``params['pgid']`` with
        ``kill -0``. Optionally refreshes the task log through
        ``common.update_log``.

        Args:
            task_id: identifier forwarded to ``common.update_log``.
            params: dict passed to ``get_client`` and read for ``log_dir``
                and either ``container_id`` or ``pgid``.
            get_log: when true, call ``common.update_log`` before closing
                the connection.

        Returns:
            ``"dead"`` when the probe command exits non-zero, otherwise
            ``"running"``.
        """
        ssh_client = get_client(params, self._config)
        # Release the connection even if the probe or the log update raises.
        try:
            if 'container_id' in params:
                exit_status, _, _ = common.run_docker_command(
                    ssh_client,
                    'inspect -f {{.State.Status}} %s' % params['container_id'])
            else:
                exit_status, _, _ = common.run_command(
                    ssh_client, 'kill -0 -%d' % params['pgid'])

            if get_log:
                common.update_log(task_id, ssh_client, params['log_dir'],
                                  self._config.get('callback_url'))
        finally:
            ssh_client.close()

        return "dead" if exit_status != 0 else "running"
示例#6
0
    def check(self, options):
        """Verify the remote setup and summarize the job queue.

        Connects to ``params['master_node']``, checks that
        ``params['log_dir']`` is an existing directory there, then runs
        ``qstat`` from ``params['torque_install_path']``.

        Args:
            options: forwarded to ``_get_params`` together with the
                service configuration.

        Returns:
            A string of the form ``"<n> jobs(s) in the queue"``.

        Raises:
            ValueError: if the log directory test fails on the remote host.
            RuntimeError: if ``qstat`` exits with a non-zero status.
        """
        params = _get_params(self._config, options)

        client = common.ssh_connect_with_retry(params['master_node'],
                                               params['login'],
                                               self._config['privateKey'])
        # try/finally releases the connection on every path, including
        # exceptions raised by the remote commands themselves.
        try:
            # check log_dir
            if not common.run_and_check_command(
                    client, "test -d '%s'" % params['log_dir']):
                raise ValueError(
                    "incorrect log directory: %s" % params['log_dir'])

            status, stdout, _ = common.run_command(
                client, os.path.join(params['torque_install_path'], "qstat"))
            if status != 0:
                raise RuntimeError('qstat exited with code %s' % status)

            # Read the output while the connection is still open.
            listing = stdout.read()
        finally:
            client.close()

        # NOTE(review): the "- 2" presumably discounts qstat's two header
        # lines, and split('\n') assumes a text (str) stream - confirm.
        # Clamp at zero so an empty listing is not reported as "-1" jobs.
        job_count = max(0, len(listing.split('\n')) - 2)
        return "%s jobs(s) in the queue" % job_count
示例#7
0
    def launch(self, task_id, options, gpulist, resource, storages,
               docker_config, docker_registry, docker_image, docker_tag,
               docker_command, docker_files, wait_after_launch):
        """Submit a docker-based task to the queue through ``qsub``.

        Builds a PBS job script (emitted remotely through a quoted
        here-document) that detects the GPU assigned to the job, optionally
        logs in to a private registry, pulls the image, runs the container
        and, when a callback URL is configured, posts the log and a
        completion notification. The script is piped to ``qsub -V`` on
        ``params['master_node']``.

        Args:
            task_id: task identifier; used for the log file name, the
                docker run command and the callback URLs.
            options: forwarded to ``_get_params``.
            storages: forwarded to ``common.cmd_docker_run``.
            docker_config: docker settings; ``registries`` is indexed by
                ``docker_registry``.
            docker_registry: key of the registry to pull from.
            docker_image: image name.
            docker_tag: image tag.
            docker_command: command forwarded to ``common.cmd_docker_run``.
            gpulist, resource, docker_files, wait_after_launch: accepted
                for interface compatibility; not used by this
                implementation.

        Returns:
            The ``params`` dict extended with ``model`` (the task id) and
            ``qsub_id`` (the stripped output of ``qsub``).

        Raises:
            RuntimeError: if the submission command exits non-zero.
        """
        params = _get_params(self._config, options)

        client = common.ssh_connect_with_retry(params['master_node'],
                                               params['login'],
                                               self._config['privateKey'])
        # try/finally releases the connection on every path, including
        # failures while assembling the script.
        try:
            # PBS header of the job script.
            cmd = "cat <<-'EOF'\n"
            cmd += "#!/bin/bash\n"
            cmd += ("#PBS -l nodes=1:ppn=2:gpus=1,mem=%sG,"
                    "walltime=10000:00:00\n" % params['mem'])
            cmd += "#PBS -p %d\n" % params['priority']
            cmd += "#PBS -N infTraining\n"
            cmd += "#PBS -o %s/%s.log -j oe\n" % (params['log_dir'], task_id)

            # Shell helper deriving the GPU index from ${PBS_GPUFILE}.
            cmd += "guessdevice(){\n"
            cmd += "    if [ -e \"${PBS_GPUFILE}\" ]\n"
            cmd += "    then\n"
            cmd += "        GPUS=`cat ${PBS_GPUFILE} | perl -pe 's/[^-]+-gpu//g' |"
            # "\\s" yields the same text as the original "\s" without
            # relying on an invalid string escape sequence.
            cmd += " perl -pe 's/\\s+/ /g' | perl -pe 's/,$//g'`\n"
            cmd += "        GPUS=`echo \"${GPUS}+1\" | bc `\n"
            cmd += "        echo $GPUS;\n"
            cmd += "    else\n"
            cmd += "        echo \"error: No available GPU\"\n"
            cmd += "    fi\n"
            cmd += "}\n"

            cmd += "DEVICE=$(guessdevice)\n"
            cmd += "echo \"RUN ON GPU ${DEVICE}\"\n"

            registry = docker_config['registries'][docker_registry]
            registry_uri = registry['uri']
            registry_urip = '' if registry_uri == '' else registry_uri + '/'
            image_ref = '%s%s:%s' % (registry_urip, docker_image, docker_tag)

            if registry['type'] != 'dockerhub':
                cmd_connect = common.cmd_connect_private_registry(registry)
                cmd += "echo '=> " + cmd_connect + "'\n"
                cmd += cmd_connect + '\n'

            cmd_docker_pull = common.cmd_docker_pull(image_ref)
            cmd += "echo '=> " + cmd_docker_pull + "'\n"
            cmd += cmd_docker_pull + '\n'

            docker_cmd = "echo | " + common.cmd_docker_run(
                "$DEVICE", docker_config, task_id, image_ref, storages,
                self._config['callback_url'],
                self._config['callback_interval'],
                docker_command)

            # Fixed: the original replace("\"", "\"") was a no-op, leaving
            # double quotes unescaped inside the double-quoted echo. Escape
            # them the same way as for the callback command below.
            cmd += "echo \"=> " + docker_cmd.replace("\"", "\\\"") + "\"\n"
            cmd += docker_cmd + '\n'

            if self._config['callback_url']:
                callback_cmd = ''
                if params['log_dir'] is not None and params['log_dir'] != '':
                    callback_cmd = (
                        'curl -X POST "%s/log/%s" --data-binary "@%s/%s.log" ; '
                        % (self._config['callback_url'], task_id,
                           params['log_dir'], task_id))

                callback_cmd += 'curl "%s/terminate/%s?phase=completed"' % (
                    self._config['callback_url'], task_id)
                cmd += ("echo \"=> " + callback_cmd.replace("\"", "\\\"")
                        + "\"\n")
                cmd += callback_cmd + '\n'

            cmd += "EOF\n"

            qsub_cmd = "echo \"$(%s)\" | %s" % (
                cmd, os.path.join(params['torque_install_path'], "qsub -V"))

            exit_status, stdout, stderr = common.run_command(client, qsub_cmd)
            if exit_status != 0:
                # stderr is read here, while the connection is still open.
                raise RuntimeError('run exited with code %d: %s' %
                                   (exit_status, stderr.read()))

            # Read the job id before the connection is closed.
            qsub_id = stdout.read().strip()
        finally:
            client.close()

        params['model'] = task_id
        params['qsub_id'] = qsub_id
        return params