Пример #1
0
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job,  $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
Пример #2
0
def concretize_resources(resources, gridjob, reservation_type):
    if reservation_type == "oar":
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)

    concretize_nodes(resources, nodes)

    if reservation_type == "oar":
        # This block is in charge of detecting the site of the oar reservation
        site_candidates = []
        for network_description in resources.get("machines", []):
            cluster = network_description.get("cluster")
            site_candidates += [ex5.get_cluster_site(cluster)]
        for network_description in resources.get("networks", []):
            site_candidates += [network_description.get("site", "unknown")]
        if len(set(site_candidates)) == 1:
            site = site_candidates[0]
        else:
            raise "Could not detect the g5k site of the oarjob %s" % gridjob
        job_sites = [(gridjob, site)]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id
        } for vlan_id in vlan_ids])

    concretize_networks(resources, vlans)
Пример #3
0
    def _get_jobs_and_vlans(self, conf):
        """Get the hosts from an existing job (if any) or from a new job.
        This will perform a reservation if necessary."""

        provider_conf = conf['provider']
        # Look if there is a running job or make a new reservation
        gridjob, _ = EX5.planning.get_job_by_name(provider_conf['name'])

        if gridjob is None:
            gridjob = self._make_reservation(conf)
        else:
            logging.info("Using running oargrid job %s" % gridjob)

        # Wait for the job to start
        EX5.wait_oargrid_job_start(gridjob)

        nodes = sorted(EX5.get_oargrid_job_nodes(gridjob),
                       key=lambda n: n.address)

        # Checking the number of nodes given
        # the disribution policy
        self._check_nodes(nodes=nodes,
                          resources=conf['resources'],
                          mode=provider_conf['role_distribution'])

        # vlans information
        job_sites = EX5.get_oargrid_job_oar_jobs(gridjob)
        jobs = []
        vlans = []
        for (job_id, site) in job_sites:
            jobs.append((site, job_id))
            vlan_id = EX5.get_oar_job_kavlan(job_id, site)
            if vlan_id is not None:
                vlans.append((site, EX5.get_oar_job_kavlan(job_id, site)))
        return (jobs, vlans, nodes)
Пример #4
0
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job,  $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
Пример #5
0
def grid_reload_from_id(gridjob):
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)

    job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    subnets = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id
        } for vlan_id in vlan_ids])
        # NOTE(msimonin): this currently returned only one subnet
        # even if several are reserved
        # We'll need to patch execo the same way it has been patched for vlans
        ipmac, info = ex5.get_oar_job_subnets(job_id, site)
        if not ipmac:
            logger.debug("No subnet information found for this job")
            continue
        subnet = {
            "site": site,
            "ipmac": ipmac,
        }
        subnet.update(info)
        # Mandatory key when it comes to concretize resources
        subnet.update({"network": info["ip_prefix"]})
        subnets.append(subnet)
    return nodes, vlans, subnets
Пример #6
0
def make_reservation(job_name=JOB_NAME, job_type='allow_classic_ssh'):
    plan = ex5.planning
    end = ex.time_utils.format_date(time.time()+12600)

    logging.basicConfig(level=logging.DEBUG)
    oargrid_job_id, _ = ex5.planning.get_job_by_name(job_name)
    if oargrid_job_id is None:
        logging.info("Starting a new job")
        planning = plan.get_planning(endtime=end)
        slots = plan.compute_slots(planning, walltime=WALLTIME, excluded_elements=excluded)
        startdate, enddate, resources = plan.find_free_slot(slots, {'grid5000':1})
        logging.info("startdate = %s, enddate = %s resources = %s" % (startdate, enddate, resources))
        resources = plan.distribute_hosts(resources, {'grid5000':1}, excluded_elements=excluded)
        # shuffling to load balance load accros nodes
        random.shuffle(resources)
        specs = plan.get_jobs_specs(resources, excluded_elements=excluded)
        spec, frontend = specs[0]
        spec.name = job_name
        logging.info("specs = %s" % spec)
        oargrid_job_id, _ = ex5.oargridsub(specs, job_type=job_type, walltime=WALLTIME)

    logging.info("Using running oargrid job %s" % oargrid_job_id)

    jobs = ex5.oargrid.get_oargrid_job_oar_jobs(oargrid_job_id=oargrid_job_id)
    # Get the frontend
    _, frontend = jobs[0]
    # Get the host
    hosts = ex5.get_oargrid_job_nodes(oargrid_job_id)
    logging.info("The slave will be running on %s,%s" % (hosts[0], frontend))
    return hosts[0], frontend
Пример #7
0
def grid_reload_from_id(gridjob):
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)

    job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    subnets = []
    for (job_id, site) in job_sites:
        vlans, subnets = get_network_info_from_job_id(job_id, site, vlans,
                                                      subnets)
    return nodes, vlans, subnets
Пример #8
0
    def _get_job(self):
        """Get the hosts from an existing job (if any) or from a new job.
        This will perform a reservation if necessary."""

        # Look if there is a running job or make a new reservation
        self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])

        if self.gridjob is None:
            self._make_reservation()
        else:
            logging.info("Using running oargrid job %s" % self.gridjob)

        # Wait for the job to start
        EX5.wait_oargrid_job_start(self.gridjob)

        # # XXX Still useful?
        # attempts = 0
        # self.nodes = None
        # while self.nodes is None and attempts < MAX_ATTEMPTS:
        #     self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
        #                             key = lambda n: n.address)
        #     attempts += 1

        self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                            key=lambda n: n.address)

        # # XXX check already done into `_deploy`.
        self._check_nodes(nodes=self.nodes,
                          resources=self.config['resources'],
                          mode=self.config['role_distribution'])

        # XXX(Ad_rien_) Start_date is never used, deadcode? - August
        # 11th 2016
        self.start_date = None
        job_info = EX5.get_oargrid_job_info(self.gridjob)
        if 'start_date' in job_info:
            self.start_date = job_info['start_date']

        # filling some information about the jobs here
        self.user = None
        job_info = EX5.get_oargrid_job_info(self.gridjob)
        if 'user' in job_info:
            self.user = job_info['user']

        # vlans information
        job_sites = EX5.get_oargrid_job_oar_jobs(self.gridjob)
        self.jobs = []
        self.vlans = []
        for (job_id, site) in job_sites:
            self.jobs.append((site, job_id))
            vlan_id = EX5.get_oar_job_kavlan(job_id, site)
            if vlan_id is not None:
                self.vlans.append((site, EX5.get_oar_job_kavlan(job_id, site)))
Пример #9
0
def concretize_resources(resources, gridjob):
    nodes = ex5.get_oargrid_job_nodes(gridjob)
    concretize_nodes(resources, nodes)

    job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id} for vlan_id in vlan_ids])

    concretize_networks(resources, vlans)
Пример #10
0
    def get_job(self):
        """Get the hosts from an existing job (if any) or from a new job.
        This will perform a reservation if necessary."""

        # Look if there is a running job or make a new reservation
        self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])

        if self.gridjob is None:
            self._make_reservation()
        else:
            logger.info("Using running oargrid job %s" % style.emph(self.gridjob))
            

        # Wait for the job to start
        EX5.wait_oargrid_job_start(self.gridjob)

        attempts = 0
        self.nodes = None
        while self.nodes is None and attempts < MAX_ATTEMPTS:
            self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                                    key = lambda n: n.address)
            attempts += 1

        check_nodes(
                nodes = self.nodes,
                resources = self.config['resources'],
                mode = self.config['role_distribution'])

        # TODO - Start_date is never used, deadcode ? Ad_rien_ - August 11th 2016
        self.start_date = None
        job_info = EX5.get_oargrid_job_info(self.gridjob)
        if 'start_date' in job_info:
            self.start_date = job_info['start_date']

        ## filling some information about the jobs here
        self.user = None
        job_info = EX5.get_oargrid_job_info(self.gridjob)
        if 'user' in job_info:
            self.user = job_info['user']

        ## vlans information
        job_sites = EX5.get_oargrid_job_oar_jobs(self.gridjob)
        self.jobs = []
        self.vlans = []
        for (job_id, site) in job_sites:
            self.jobs.append((site, job_id))
            vlan_id = EX5.get_oar_job_kavlan(job_id, site)
            if vlan_id is not None:
                self.vlans.append((site, EX5.get_oar_job_kavlan(job_id, site)))

        return self.gridjob
Пример #11
0
    def get_hosts_list(self, hosts_str):
        """Generate a list of hosts from the given string.

        Args:
          hosts_str (str): The following options are supported

            - The path of the file containing the hosts to be used. Each host
            should be in a different line. Repeated hosts are pruned.
            Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

            - A comma-separated list of  site:job_id

            - A comma-separated list of hosts.

            - An oargrid_job_id

        Return:
          list of Host: The list of hosts.
        """
        hosts = []
        if os.path.isfile(hosts_str):
            for line in open(hosts_str):
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif ':' in hosts_str:
            # We assume the string is a comma separated list of site:job_id
            for job in hosts_str.split(','):
                site, job_id = job.split(':')
                hosts += get_oar_job_nodes(int(job_id), site)
        elif "," in hosts_str:
            # We assume the string is a comma separated list of hosts
            for hstr in hosts_str.split(','):
                h = Host(hstr.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif hosts_str.isdigit():
            # If the file_name is a number, we assume this is a oargrid_job_id
            hosts = get_oargrid_job_nodes(int(hosts_str))
        else:
            # If not any of the previous, we assume is a single-host cluster
            # where the given input is the only host
            hosts = [Host(hosts_str.rstrip())]

        logger.debug('Hosts list: \n%s',
                     ' '.join(style.host(host.address.split('.')[0])
                              for host in hosts))
        return hosts
Пример #12
0
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an a comma separated list
        of hosts or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If not any of the previous, we assume is a single-host cluster where
        # the given input is the only host
        hosts = [Host(hosts_input.rstrip())]

    logger.debug("Hosts list: \n%s", " ".join(style.host(host.address.split(".")[0]) for host in hosts))
    return hosts
Пример #13
0
    def run(self):
        """ """
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait that the job starts
                logger.info('Waiting that the job start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration

                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()

                Put(self.hosts, [
                    "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                    "platform_aws.xml", "cloud_ec2.xml"
                ],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run(
                    )
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = [
                    host for host in self.hosts for i in range(
                        get_host_attributes(host)['architecture']['smt_size'])
                ]

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oargrid_job_id = None

        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')