Пример #1
0
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned and empty lines are ignored.
        Hint: in a running Grid5000 job,  $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.

    Raises:
      ValueError: If a job id is not a valid integer, or if an element of
        the comma separated list does not contain exactly one ':'.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # The context manager guarantees the file is closed, even if
        # building a Host raises.
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                name = line.rstrip()
                if not name:
                    # Skip blank lines instead of creating a Host('').
                    continue
                h = Host(name)
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    # Only the short host name (text before the first dot) is logged.
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
Пример #2
0
def get_oargrid_job_nodes(oargrid_job_id,
                          frontend_connection_params=None,
                          timeout=False):
    """Return a list of `execo.host.Host` containing the hosts of an oargrid job.

    The hosts are deduplicated through a set, so the order of the
    returned list is unspecified.

    :param oargrid_job_id: the oargrid job id (formatted with ``%i``,
      so it must be an integer).

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    :raises ProcessesFailed: if the ``oargridstat`` process did not
      complete successfully.
    """
    # Only the literal False selects the configured default; 0 and None
    # are passed through unchanged.
    if timeout is False:
        timeout = g5k_configuration.get('default_timeout')
    # Try "oargridstat -wl" first and fall back to "oargridstat -l" if
    # the first command fails.
    process = get_process(
        "oargridstat -wl %i 2>/dev/null || oargridstat -l %i 2>/dev/null" %
        (oargrid_job_id, oargrid_job_id),
        host=get_frontend_host(),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if not process.ok:
        raise ProcessesFailed([process])
    # Every whitespace-separated token of the output is a host address.
    host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
    return list(set(Host(host_address) for host_address in host_addresses))
Пример #3
0
 def test_build_roles_less_deployed_nodes(self):
     """With five deployed nodes, every role ends up with exactly one node."""
     # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
     # which cannot be measured or iterated more than once.
     self.engine.deployed_nodes = [
         Host(name) for name in ["a-1", "a-2", "a-3", "a-4", "a-5"]
     ]
     roles = self.engine.build_roles()
     self.assertEqual(1, len(roles["controller"]))
     self.assertEqual(1, len(roles["storage"]))
     self.assertEqual(1, len(roles["compute"]))
     self.assertEqual(1, len(roles["network"]))
     self.assertEqual(1, len(roles["util"]))
Пример #4
0
def get_frontend_host(frontend=None):
    """Given a frontend name, or None, and based on the global configuration, returns the frontend to connect to or None.

    :param frontend: name of the frontend. If None, the value returned
      by ``get_default_frontend()`` is used.

    :returns: a `Host` built from the frontend name, or a falsy value
      (None) when there is no frontend name, or when the frontend is
      the default one and the ``no_ssh_for_local_frontend``
      configuration entry equals True.
    """
    if frontend is None:
        frontend = get_default_frontend()
    # The comparison with True is kept explicit: only a configuration
    # value equal to True disables the connection to the local frontend.
    if g5k_configuration.get('no_ssh_for_local_frontend'
                             ) == True and frontend == get_default_frontend():
        frontend = None
    if frontend:
        frontend = Host(frontend)
    return frontend
Пример #5
0
def get_oar_job_nodes(oar_job_id=None,
                      frontend=None,
                      frontend_connection_params=None,
                      timeout=False):
    """Return a list of `execo.host.Host` containing the hosts of an oar job.

    This method waits for the job start (the list of nodes isn't fixed
    until the job start).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    :raises ValueError: if no job id is given and ``OAR_JOB_ID`` is
      missing from the environment or is not an integer.

    :raises ProcessesFailed: if the process querying the job nodes did
      not complete successfully.
    """
    # Only the literal False selects the configured default; 0 and None
    # are passed through unchanged.
    if timeout is False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' not in os.environ:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
        # Environment values are strings, while the command below formats
        # the id with %i, which rejects str: convert it explicitly.
        oar_job_id = int(os.environ['OAR_JOB_ID'])
    # A single countdown is shared between waiting for the job start and
    # running the query, so the timeout bounds the whole call.
    countdown = Timer(timeout)
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # Raw string: the "\|" sequences must reach grep untouched.
    process = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && oarstat -pj %(oar_job_id)i | oarprint host -f -"
        % {'oar_job_id': oar_job_id},
        host=get_frontend_host(frontend),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.shell = process.pty = True
    process.run()
    if not process.ok:
        raise ProcessesFailed([process])
    # Every whitespace-separated token of the output is a host address.
    host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
    return [Host(host_address) for host_address in host_addresses]
Пример #6
0
 def test_build_roles_with_multiple_clusters(self):
     """Roles requested on two clusters are merged per role name.

     Cluster "a" asks for 2 compute nodes and cluster "b" for 2 more, so
     the "compute" role is expected to hold 4 nodes in total.
     """
     self.engine.config = {
         "resources": {
             "a": {
                 "controller": 1,
                 "compute": 2,
                 "network": 1,
                 "storage": 1,
                 "util": 1
             },
             "b": {
                 "compute": 2
             }
         }
     }
     # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
     # which cannot be measured or iterated more than once.
     self.engine.deployed_nodes = [
         Host(name)
         for name in ["a-1", "a-2", "a-3", "a-4", "a-5", "a-6", "b-1", "b-2"]
     ]
     roles = self.engine.build_roles()
     self.assertEqual(1, len(roles["controller"]))
     self.assertEqual(1, len(roles["storage"]))
     self.assertEqual(4, len(roles["compute"]))
     self.assertEqual(1, len(roles["network"]))
     self.assertEqual(1, len(roles["util"]))
Пример #7
0
def deploy(deployment,
           check_deployed_command=True,
           node_connection_params={'user': '******'},
           num_tries=1,
           check_enough_func=None,
           frontend_connection_params=None,
           deploy_timeout=None,
           check_timeout=30,
           stdout_handlers=None,
           stderr_handlers=None):
    """Deploy nodes, many times if needed, checking which of these nodes are already deployed with a user-supplied command. If no command given for checking if nodes deployed, rely on kadeploy to know which nodes are deployed.

    - if ``check_deployed_command`` given, a first check is made before
      any deployment, so that already deployed nodes are not redeployed.

    - loop at most `num_tries` times:

      - optionally call user-supplied ``check_enough_func``, passing
        to it the deployed and undeployed hosts, to let user code
        decide if enough nodes deployed. Otherwise, try as long as
        there are undeployed nodes.

      - deploy the undeployed nodes

      - if ``check_deployed_command`` given, try to connect to these
        hosts using the supplied `node_connection_params` (or the
        default ones), and to execute ``check_deployed_command``. If
        connection succeeds and the command returns 0, the host is
        assumed to be deployed, else it is assumed to be undeployed.

    returns a tuple with the set of deployed hosts and the set of
    undeployed hosts. The undeployed hosts are host addresses;
    deployed hosts are host addresses when checked with
    ``check_deployed_command``, otherwise they are whatever kadeploy
    reports as deployed.

    When checking correctly deployed nodes with
    ``check_deployed_command``, and if the deployment is using the
    kavlan option, this function will try to contact the nodes using
    the appropriate DNS hostnames in the new vlan.

    :param deployment: instance of `execo.kadeploy.Deployment` class
      describing the intended kadeployment.

    :param check_deployed_command: command to perform remotely to
      check node deployment. May be a String, True, False or None. If
      String: the actual command to be used (This command should
      return 0 if the node is correctly deployed, or another value
      otherwise). If True, the default command value will be used
      (from `execo_g5k.config.g5k_configuration`). If None or False,
      no check is made and deployed/undeployed status will be taken
      from kadeploy's output.

    :param node_connection_params: a dict similar to
      `execo.config.default_connection_params` whose values will
      override those in `execo.config.default_connection_params` when
      connecting to check node deployment with
      ``check_deployed_command`` (see above). It is only read, never
      modified, by this function.

    :param num_tries: number of deploy tries

    :param check_enough_func: a function taking as parameter the
      deployed hosts and the undeployed hosts (both passed as sets),
      which will be called before each deployment iteration, and that
      should return a boolean indicating if there is already enough
      nodes (in this case, no further deployment will be attempted).

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param deploy_timeout: timeout for deployment. Default is None,
      which means no timeout.

    :param check_timeout: timeout for node deployment checks. Default
      is 30 seconds.

    :param stdout_handlers: iterable of `ProcessOutputHandlers`
          which will be passed to the actual deploy processes.

    :param stderr_handlers: iterable of `ProcessOutputHandlers`
          which will be passed to the actual deploy processes.
    """

    if check_enough_func is None:
        check_enough_func = lambda deployed, undeployed: len(undeployed) == 0

    # Kept as an equality test on purpose: a command string is left
    # untouched, only a value equal to True selects the configured default.
    if check_deployed_command == True:
        check_deployed_command = g5k_configuration.get(
            'check_deployed_command')

    def check_update_deployed(undeployed_hosts, check_deployed_command,
                              node_connection_params, vlan):  #IGNORE:W0613
        """Run the check command on the given hosts and return the list of those on which it succeeded."""
        logger.debug(
            style.emph("check which hosts are already deployed among:") +
            " %s", undeployed_hosts)
        # Maps the name actually contacted (the kavlan name when a vlan is
        # used) back to the original host address.
        deployment_hostnames_mapping = dict()
        if vlan:
            for host in undeployed_hosts:
                deployment_hostnames_mapping[get_kavlan_host_name(host,
                                                                  vlan)] = host
        else:
            for host in undeployed_hosts:
                deployment_hostnames_mapping[host] = host
        deployed_check = get_remote(check_deployed_command,
                                    list(deployment_hostnames_mapping),
                                    connection_params=node_connection_params)
        # Failures are the expected outcome on undeployed nodes: silence
        # their logging and bound each check with check_timeout.
        for p in deployed_check.processes:
            p.nolog_exit_code = True
            p.nolog_timeout = True
            p.nolog_error = True
            p.timeout = check_timeout
        deployed_check.run()
        newly_deployed = list()
        for process in deployed_check.processes:
            logger.debug(
                style.emph("check on %s:" %
                           (process.host, )) + " %s\n" % (process, ) +
                style.emph("stdout:") + "\n%s\n" % (process.stdout) +
                style.emph("stderr:") + "\n%s\n" % (process.stderr))
            if (process.ok):
                newly_deployed.append(
                    deployment_hostnames_mapping[process.host.address])
                logger.debug(
                    "OK %s",
                    deployment_hostnames_mapping[process.host.address])
            else:
                logger.debug(
                    "KO %s",
                    deployment_hostnames_mapping[process.host.address])
        return newly_deployed

    start_time = time.time()
    deployed_hosts = set()
    undeployed_hosts = set([Host(host).address for host in deployment.hosts])
    my_newly_deployed = []
    if check_deployed_command:
        # Initial check: hosts already deployed are not deployed again.
        my_newly_deployed = check_update_deployed(undeployed_hosts,
                                                  check_deployed_command,
                                                  node_connection_params,
                                                  deployment.vlan)
        deployed_hosts.update(my_newly_deployed)
        undeployed_hosts.difference_update(my_newly_deployed)
    num_tries_done = 0
    elapsed = time.time() - start_time
    last_time = time.time()
    deploy_stats = list()  # contains tuples ( timestamp,
    #                   num attempted deploys,
    #                   len(kadeployer.deployed_hosts),
    #                   len(my_newly_deployed),
    #                   len(deployed_hosts),
    #                   len(undeployed_hosts )
    deploy_stats.append((elapsed, None, None, len(my_newly_deployed),
                         len(deployed_hosts), len(undeployed_hosts)))
    while (not check_enough_func(deployed_hosts, undeployed_hosts)
           and num_tries_done < num_tries):
        num_tries_done += 1
        # Snapshot the number of attempted deploys now: tmp_deployment.hosts
        # below is the very same set object as undeployed_hosts, which is
        # shrunk in place once the results are known.
        num_attempted = len(undeployed_hosts)
        logger.debug(
            style.emph("try %i, deploying on:" % (num_tries_done, )) + " %s",
            undeployed_hosts)
        tmp_deployment = copy.copy(deployment)
        tmp_deployment.hosts = undeployed_hosts
        kadeployer = Kadeployer(
            tmp_deployment,
            frontend_connection_params=frontend_connection_params,
            stdout_handlers=stdout_handlers,
            stderr_handlers=stderr_handlers)
        kadeployer.timeout = deploy_timeout
        kadeployer.run()
        my_newly_deployed = []
        if check_deployed_command:
            my_newly_deployed = check_update_deployed(undeployed_hosts,
                                                      check_deployed_command,
                                                      node_connection_params,
                                                      deployment.vlan)
            deployed_hosts.update(my_newly_deployed)
            undeployed_hosts.difference_update(my_newly_deployed)
        else:
            # No check command: trust what kadeploy reports.
            deployed_hosts.update(kadeployer.deployed_hosts)
            undeployed_hosts.difference_update(kadeployer.deployed_hosts)
        logger.debug(
            style.emph("kadeploy reported newly deployed hosts:") + "   %s",
            kadeployer.deployed_hosts)
        logger.debug(
            style.emph("check reported newly deployed hosts:") + "   %s",
            my_newly_deployed)
        logger.debug(
            style.emph("all deployed hosts:") + "     %s", deployed_hosts)
        logger.debug(
            style.emph("still undeployed hosts:") + " %s", undeployed_hosts)
        elapsed = time.time() - last_time
        last_time = time.time()
        deploy_stats.append(
            (elapsed, num_attempted,
             len(kadeployer.deployed_hosts), len(my_newly_deployed),
             len(deployed_hosts), len(undeployed_hosts)))

    logger.detail(
        style.emph("deploy finished") + " in %i tries, %s", num_tries_done,
        format_seconds(time.time() - start_time))
    logger.detail(
        "deploy  duration  attempted  deployed     deployed     total     total"
    )
    logger.detail(
        "                  deploys    as reported  as reported  already   still"
    )
    logger.detail(
        "                             by kadeploy  by check     deployed  undeployed"
    )
    logger.detail(
        "---------------------------------------------------------------------------"
    )
    for (deploy_index, deploy_stat) in enumerate(deploy_stats):
        logger.detail(
            "#%-5.5s  %-8.8s  %-9.9s  %-11.11s  %-11.11s  %-8.8s  %-10.10s",
            deploy_index, format_seconds(deploy_stat[0]), deploy_stat[1],
            deploy_stat[2], deploy_stat[3], deploy_stat[4], deploy_stat[5])
    logger.debug(style.emph("deployed hosts:") + " %s", deployed_hosts)
    logger.debug(style.emph("undeployed hosts:") + " %s", undeployed_hosts)

    return (deployed_hosts, undeployed_hosts)
Пример #8
0
 def test_not_enough_nodes(self):
     """Building roles with only four deployed nodes must raise."""
     # Build a real list: with a Python 3 ``map`` iterator, a TypeError
     # raised by len() or indexing would satisfy assertRaises(Exception)
     # for the wrong reason.
     self.engine.deployed_nodes = [
         Host(name) for name in ["a-1", "a-2", "a-3", "a-4"]
     ]
     with self.assertRaises(Exception):
         self.engine.build_roles()