Пример #1
0
def is_job_alive(oar_job_ids):
    """Check if the given OAR_JOB_IDs are still alive on Grid5000 system or not

    Parameters
    ----------
    oar_job_ids: dict or iterable of tuples
        the reserved information, given either as a dictionary
        (key: str, the name of the site on Grid5000 system;
        value: int, the number of the reservation on that site)
        or as an iterable of ``(oar_job_id, site)`` pairs

    Returns
    -------
    bool
        True: if none of the given jobs is in the 'Error' state
        (this includes an empty input)
        False: as soon as one of the given jobs is in the 'Error' state

    """
    # Iterating a dict directly yields only its keys, which cannot be
    # unpacked into (job id, site); normalise both accepted input shapes.
    if isinstance(oar_job_ids, dict):
        jobs = [(job_id, site) for site, job_id in oar_job_ids.items()]
    else:
        jobs = list(oar_job_ids)

    for oar_job_id, site in jobs:
        job_info = get_oar_job_info(oar_job_id, site)
        # The job info may come back without a 'state' entry; wait a little
        # and ask again until the state is known.
        while 'state' not in job_info:
            sleep(5)
            job_info = get_oar_job_info(oar_job_id, site)
        if job_info['state'] == 'Error':
            return False
    return True
Пример #2
0
    def is_job_alive(self):
        """Tell whether the OAR job ``self.oar_job_id`` is still alive.

        The job information is requested until it either reports the
        'Error' state or contains both ``start_date`` and ``walltime``.

        Returns
        -------
        bool
            False if the job is in the 'Error' state or if
            ``start_date + walltime`` is already in the past, True otherwise.
        """
        while True:
            rez = get_oar_job_info(self.oar_job_id)
            # A job in the Error state is dead whatever its dates say; check
            # this on every answer, including the very first one.
            if 'state' in rez and rez['state'] == 'Error':
                return False
            # Both entries are needed below; an answer missing either of
            # them is incomplete, so ask again instead of raising KeyError.
            if 'start_date' in rez and 'walltime' in rez:
                return rez["start_date"] + rez["walltime"] > time.time()
Пример #3
0
    def get_host(self):
        """Return the first host of an existing reservation (if any), or of
        a new reservation.

        The site of ``self.config['cluster']`` is stored in ``self.site``.
        The current OAR jobs of that site are searched for one whose name is
        ``self.options.job_name``; if none is found,
        ``self._make_reservation`` is called. The process exits with
        status 6 when no job id is available afterwards. Otherwise the
        method waits for the job to start, prints its nodes and returns the
        first one.
        """

        # Look if there is a running job
        self.site = get_cluster_site(self.config['cluster'])
        jobs = EX5.get_current_oar_jobs([self.site])

        self.job_id = None
        for t in jobs:
            info = EX5.get_oar_job_info(t[0], self.site)
            # The job info may be incomplete; a job without a known name
            # simply does not match.
            if 'name' in info and info['name'] == self.options.job_name:
                self.job_id = t[0]
                break

        if self.job_id:
            logger.info('Using job %s', style.emph(self.job_id))
        else:
            logger.info('Making a new reservation')
            # NOTE(review): presumably sets self.job_id on success - confirm
            self._make_reservation(self.site)

        if not self.job_id:
            logger.error("Could not get a reservation for the job")
            exit(6)

        EX5.wait_oar_job_start(self.job_id, self.site)

        # Ask for the nodes once and reuse the answer for both the display
        # and the return value.
        nodes = EX5.get_oar_job_nodes(self.job_id, self.site)
        pp(nodes)
        return nodes[0]
Пример #4
0
    def run(self):
        """
            Main engine method to perform the experiment

            For each remaining combination, nodes are obtained with
            ``get_nodes`` and the workflow is run on it, then on every other
            combination sharing the same 'cluster' and 'cores'. The job is
            deleted once the workflow has been attempted for a submission.
        """
        def job_is_broken():
            # The job is considered broken when OAR reports the Error state
            info = get_oar_job_info(self.oar_job_id, self.frontend)
            return info['state'] == 'Error'

        self.define_parameters()

        while len(self.sweeper.get_remaining()) > 0:
            # Pick the combination that drives this submission
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # A broken job stops the whole program
            if job_is_broken():
                break

            def same_submission(remaining):
                # Keep the combinations that can reuse the current submission
                return filter(
                    lambda c: c['cores'] == comb['cores']
                    and c['cluster'] == comb['cluster'],
                    remaining)

            try:
                self.workflow(comb)

                # Drain every combination compatible with this submission
                while True:
                    subcomb = self.sweeper.get_next(same_submission)
                    if not subcomb:
                        logger.info(
                            'No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break

                    logger.info(
                        style.host(slugify(subcomb)) + ' has been started')
                    if job_is_broken():
                        break
                    self.workflow(subcomb)
            finally:
                # Errors or normal completion: the job is deleted either way
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])
Пример #5
0
    def run(self):
        """
            Main engine method to perform the experiment

            Loops over the remaining combinations: gets nodes for the next
            one, runs the workflow on it and on all the other combinations
            with the same 'cluster' and 'cores', then deletes the job.
        """
        self.define_parameters()

        def current_state():
            # State of the job as reported by OAR
            return get_oar_job_info(self.oar_job_id, self.frontend)['state']

        while len(self.sweeper.get_remaining()) > 0:
            # Next combination, which defines the submission
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # The program stops as soon as the job is broken
            if current_state() == 'Error':
                break

            try:
                self.workflow(comb)

                # Reuse the submission for every compatible combination
                while True:
                    compatible = lambda r: filter(
                        lambda x: x['cores'] == comb['cores']
                        and x['cluster'] == comb['cluster'], r)
                    subcomb = self.sweeper.get_next(compatible)

                    if not subcomb:
                        logger.info('No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break

                    logger.info(style.host(slugify(subcomb)) + ' has been started')
                    if current_state() == 'Error':
                        break
                    self.workflow(subcomb)

            finally:
                # Reached on errors as well as at the end of the inner loop
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])
Пример #6
0
def get_job_by_name(job_name, sites=None):
    """Look for a current job named *job_name*.

    The OAR jobs of the current oargrid jobs are inspected first, then the
    current OAR jobs of *sites*.

    Parameters
    ----------
    job_name: str
        the name of the job to look for
    sites: list, optional
        the sites whose OAR jobs are inspected; when empty or None, the
        result of ``get_g5k_sites()`` is used

    Returns
    -------
    tuple
        ``(oargrid_job_id, None)`` if an oargrid job contains a matching
        OAR job; the matching entry of ``get_current_oar_jobs`` (its first
        two items are used here as job id and site) if an OAR job matches;
        ``(None, None)`` if nothing matches.
    """
    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()

    for g_job in get_current_oargrid_jobs():
        for job in get_oargrid_job_oar_jobs(g_job):
            info = get_oar_job_info(job[0], job[1])
            # The job info may be incomplete; skip jobs without a known name
            if 'name' in info and info['name'] == job_name:
                logger.info('Oargridjob %s found !', style.emph(g_job))
                return g_job, None

    for job in get_current_oar_jobs(sites):
        info = get_oar_job_info(job[0], job[1])
        if 'name' in info and info['name'] == job_name:
            logger.info('Job %s found on site %s !', style.emph(job[0]),
                        style.host(job[1]))
            return job
    return None, None
Пример #7
0
    def run(self):
        """Inherited method, put here the code for running the engine

        Reuses the job given by ``self.options.oar_job_id`` or makes a
        reservation, deploys the hosts of the job, then runs ``self.xp`` on
        the next combination and on every other combination having the same
        'sizes'. This is repeated while the sweeper has remaining
        combinations. On exit the job is deleted unless
        ``self.options.keep_alive`` is set.
        """
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # NOTE(review): define_parameters was already called above, before
            # self.cluster was set - confirm whether both calls are needed.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    # The flag described the previous job; without this reset
                    # every following iteration would reserve again.
                    job_is_dead = False
                # Retrieving the hosts and subnets parameters
                # NOTE(review): self.frontend is not set in this method
                # (only self.site is) - presumably set elsewhere, confirm.
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(
                    Deployment(self.hosts,
                               env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" %
                            (len(deployed), len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                # NOTE(review): attr is not used afterwards in this method.
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: filter(
                        lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if not newcomb:
                        break
                    try:
                        self.xp(newcomb)
                    except Exception:
                        # Leave the subloop on failure, but keep a trace of
                        # the error and let KeyboardInterrupt/SystemExit
                        # propagate to the cleanup below.
                        logger.exception('Experiment failed for %s', newcomb)
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Пример #8
0
def get_job_info(site, job_id):
    """Return the first node of job *job_id* on *site*.

    The nodes are only requested when *site* has at least one current OAR
    job; otherwise None is returned.
    """
    current_jobs = EX5.get_current_oar_jobs([site])
    for entry in current_jobs:
        # NOTE(review): the job info of the first current job is requested
        # but its result is not used, and the loop always returns during its
        # first iteration - confirm the intent.
        EX5.get_oar_job_info(entry[0], site)
        nodes = EX5.get_oar_job_nodes(job_id, site)
        return nodes[0]
    return None
Пример #9
0
    def run(self):
        """Inherited method, put here the code for running the engine

        Reuses the job given by ``self.options.oar_job_id`` or makes a
        reservation, deploys the hosts of the job, then runs ``self.xp`` on
        the next combination and on every other combination having the same
        'sizes'. This is repeated while the sweeper has remaining
        combinations. On exit the job is deleted unless
        ``self.options.keep_alive`` is set.
        """
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # NOTE(review): define_parameters was already called above, before
            # self.cluster was set - confirm whether both calls are needed.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    # The flag described the previous job; without this reset
                    # every following iteration would reserve again.
                    job_is_dead = False
                # Retrieving the hosts and subnets parameters
                # NOTE(review): self.frontend is not set in this method
                # (only self.site is) - presumably set elsewhere, confirm.
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts,
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed),
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                # NOTE(review): attr is not used afterwards in this method.
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r:
                            filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if not newcomb:
                        break
                    try:
                        self.xp(newcomb)
                    except Exception:
                        # Leave the subloop on failure, but keep a trace of
                        # the error and let KeyboardInterrupt/SystemExit
                        # propagate to the cleanup below.
                        logger.exception('Experiment failed for %s', newcomb)
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Пример #10
0
 def is_job_alive(self):
     """Tell whether the job's start date plus walltime is still in the future."""
     info = get_oar_job_info(self.oar_job_id)
     # End of the reservation, in the same unit as time.time()
     expiry = info["start_date"] + info["walltime"]
     if expiry > time.time():
         return True
     return False
Пример #11
0
 def is_job_alive(self):
     """Return the OAR job info, retried until it contains a 'state' entry.

     Note that the whole info dictionary is returned, not a boolean.
     """
     while True:
         info = get_oar_job_info(self.oar_job_id, self.frontend)
         if 'state' in info:
             return info
         logger.info('Retrying getting oar_job_info')
Пример #12
0
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``

        Reuses the OAR job given by ``self.options.oar_job_id`` or makes a
        reservation, then, while the sweeper has remaining combinations,
        starts one daemon thread running ``self.workflow`` per combination.
        Each thread receives ``self.options.n_nodes`` hosts and
        ``self.comb_nvm(comb)`` ip/mac entries taken from the pools of
        available resources; these are put back in the pools once the thread
        has finished. On exit the job is deleted with ``oardel`` unless
        ``self.options.keep_alive`` is set or no job id is held any more.
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # NOTE(review): presumably this is what sets self.sweeper - confirm
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                # NOTE(review): presumably fills self.hosts and self.ip_mac,
                # which are read just below - confirm
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                # Nothing can be run without hosts
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                # Maps each started thread to the resources it was given
                threads = {}

                # Checking that the job is running and not in Error
                # (the loop also goes on while some threads are still registered)
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    # Wait until enough hosts are free for one more workflow
                    while self.options.n_nodes > len(available_hosts):
                        # Iterate over a copy, as entries are deleted from threads
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                # Give back the resources of the finished thread
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # No combination to start: wait for the running
                        # threads to finish before leaving the loop
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    # Take the hosts for this combination out of the pool
                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    # Take as many ip/mac entries as self.comb_nvm(comb) gives
                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    # Run the workflow of this combination in its own thread
                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

                # Forget a dead job: the next iteration makes a new
                # reservation and the cleanup below has nothing to delete
                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Пример #13
0
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``

        Reuses the OAR job given by ``self.options.oar_job_id`` or makes a
        reservation, then, while the sweeper has remaining combinations,
        starts one daemon thread running ``self.workflow`` per combination.
        Each thread receives ``self.options.n_nodes`` hosts and
        ``self.comb_nvm(comb)`` ip/mac entries taken from the pools of
        available resources; these are put back in the pools once the thread
        has finished. On exit the job is deleted with ``oardel`` unless
        ``self.options.keep_alive`` is set or no job id is held any more.
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # NOTE(review): presumably this is what sets self.sweeper - confirm
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                # NOTE(review): presumably fills self.hosts and self.ip_mac,
                # which are read just below - confirm
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                # Nothing can be run without hosts
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                # Maps each started thread to the resources it was given
                threads = {}

                # Checking that the job is running and not in Error
                # (the loop also goes on while some threads are still registered)
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    # Wait until enough hosts are free for one more workflow
                    while self.options.n_nodes > len(available_hosts):
                        # Iterate over a copy, as entries are deleted from threads
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                # Give back the resources of the finished thread
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(
                                    tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # No combination to start: wait for the running
                        # threads to finish before leaving the loop
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    # Take the hosts for this combination out of the pool
                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    # Take as many ip/mac entries as self.comb_nvm(comb) gives
                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    # Run the workflow of this combination in its own thread
                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

                # Forget a dead job: the next iteration makes a new
                # reservation and the cleanup below has nothing to delete
                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')