class mpi_bench(Engine):
    """Execo engine that runs the NPB LU MPI benchmark on several
    Grid'5000 clusters, sweeping over cluster, core count and problem size.

    Relies on execo / execo_g5k names imported at file level (oarsub,
    Remote, Put, ParamSweeper, ...).
    """

    def run(self):
        """Inherited method, put here the code for running the engine."""
        self.define_parameters()
        if self.prepare_bench():
            logger.info('Bench prepared on all frontends')
            self.run_xp()

    def define_parameters(self):
        """Create the iterator on the parameter combinations to be explored."""
        # fixed number of nodes per job
        self.n_nodes = 4
        # choose a list of clusters
        clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
        # compute the maximum number of cores among all clusters
        max_core = self.n_nodes * max([
            get_host_attributes(cluster + '-1')['architecture']['smt_size']
            for cluster in clusters])
        # n_core sweeps the powers of two in [n_nodes, max_core)
        # (list comprehension instead of Py2-only filter() so the value is a
        # list on both Python 2 and 3)
        self.parameters = {
            'cluster': clusters,
            'n_core': [c for c in takewhile(lambda c: c < max_core,
                                            (2 ** p for p in count(0, 1)))
                       if c >= self.n_nodes],
            'size': ['A', 'B', 'C']
        }
        logger.info(self.parameters)
        # define the iterator over the parameters combinations; the sweep
        # state is persisted in result_dir so a run can be resumed
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def prepare_bench(self):
        """Bench configuration and compilation, copy binaries to frontends.

        Returns True if the preparation is ok, False otherwise.
        """
        logger.info("preparation: configure and compile benchmark")
        # the involved sites; the compilation is done on the first of these
        sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
        # generate the bench compilation configuration: one 'lu' line per
        # (size, n_core) combination
        bench_list = '\n'.join(['lu\t%s\t%s' % (size, n_core)
                                for n_core in self.parameters['n_core']
                                for size in self.parameters['size']])
        # Reserving a node because compiling on the frontend is forbidden
        # and because we need mpif77
        jobs = oarsub([(OarSubmission(resources="nodes=1",
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'), sites[0])])
        if not jobs[0][0]:
            # early return: the original fell through to the rsync below and
            # crashed with a NameError on 'nodes' when submission failed
            logger.error("unable to submit the compilation job")
            return False
        try:
            logger.info("copying bench archive to %s", sites[0])
            copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
            logger.info("extracting bench archive on %s", sites[0])
            extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2',
                                   [sites[0]]).run()
            logger.info("waiting job start %s", jobs[0])
            wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
            logger.info("getting nodes of %s", jobs[0])
            nodes = get_oar_job_nodes(*jobs[0])
            logger.info("configure bench compilation")
            conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def'
                                % bench_list, nodes).run()
            logger.info("compil bench")
            compilation = Remote('cd NPB3.3-MPI && make clean && make suite',
                                 nodes).run()
            logger.info("compil finished")
        except Exception:
            # narrowed from a bare except:; keep the traceback in the logs
            logger.exception("unable to compile bench")
            return False
        finally:
            oardel(jobs)
        # Copying binaries to all other frontends
        frontends = sites[1:]
        rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                       [get_host_site(nodes[0])] * len(frontends))
        rsync.run()
        return compilation.ok and rsync.ok

    def run_xp(self):
        """Iterate over the parameters and execute the bench."""
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            # skip combinations that need more cores than the cluster offers
            if comb['n_core'] > get_host_attributes(
                    comb['cluster'] + '-1')['architecture']['smt_size'] \
                    * self.n_nodes:
                self.sweeper.skip(comb)
                continue
            logger.info('Processing new combination %s', comb)
            site = get_cluster_site(comb['cluster'])
            jobs = oarsub([(OarSubmission(
                resources="{cluster='" + comb['cluster'] + "'}/nodes="
                          + str(self.n_nodes),
                job_type='allow_classic_ssh',
                walltime='0:10:00'), site)])
            if jobs[0][0]:
                try:
                    wait_oar_job_start(*jobs[0])
                    nodes = get_oar_job_nodes(*jobs[0])
                    bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                        ",".join([node.address for node in nodes]),
                        comb['n_core'],
                        get_mpi_opts(comb['cluster']),
                        comb['size'],
                        comb['n_core'])
                    lu_bench = SshProcess(bench_cmd, nodes[0])
                    lu_bench.stdout_handlers.append(self.result_dir + '/'
                                                    + slugify(comb) + '.out')
                    lu_bench.run()
                    if lu_bench.ok:
                        logger.info("comb ok: %s", comb)
                        self.sweeper.done(comb)
                        continue
                finally:
                    oardel(jobs)
            # reached on submission failure, bench failure or exception
            logger.info("comb NOT ok: %s", comb)
            self.sweeper.cancel(comb)
class oar_replay_workload(Engine):
    """Execo engine that deploys an OAR cluster on Grid'5000 nodes and
    replays scheduler workload traces on it.

    Relies on module-level names: script_path, prediction_callback, copy
    (shutil), and the execo / execo_g5k API.
    """

    def init(self):
        """Declare the command-line options and positional arguments."""
        parser = self.options_parser
        parser.add_option('--is_a_test',
                          dest='is_a_test',
                          action='store_true',
                          default=False,
                          help='prefix the result folder with "test", enter '
                               'a debug mode if fails and remove the job '
                               'afterward, unless it is a reservation')
        parser.add_option('--already_configured',
                          dest='already_configured',
                          action='store_true',
                          default=False,
                          help='if set, the OAR cluster is not re-configured')
        parser.add_option('--reservation_id',
                          help="Grid'5000 reservation job ID")
        parser.add_argument('experiment_config',
                            'The config JSON experiment description file')

    def setup_result_dir(self):
        """Build the timestamped result directory path (prefixed with
        'test_' for test runs)."""
        run_type = "test_" if self.options.is_a_test else ""
        self.result_dir = script_path + '/' + run_type + 'results_' + \
            time.strftime("%Y-%m-%d--%H-%M-%S")
        # typo fix: was 'resutlt directory'
        logger.info('result directory: {}'.format(self.result_dir))

    def _configure_oar(self, nodes):
        """Install and configure OAR: server on nodes[0], oar-node on the
        rest, then declare the resources. Raises RuntimeError on failure."""
        # install OAR; node install runs in the background while the server
        # (longer) install runs in the foreground
        install_cmd = "apt-get update; apt-get install -y "
        node_packages = "oar-node"
        logger.info("installing OAR nodes: {}".format(str(nodes[1:])))
        install_oar_nodes = Remote(install_cmd + node_packages, nodes[1:],
                                   connection_params={'user': '******'})
        install_oar_nodes.start()

        server_packages = ("oar-server oar-server-pgsql oar-user "
                           "oar-user-pgsql postgresql python3-pip "
                           "libjson-perl postgresql-server-dev-all")
        # pin a known-good oar3 revision and expose the kamelot scheduler
        install_oar_sched_cmd = """
        mkdir -p /opt/oar_sched; \
        cd /opt/oar_sched; \
        git clone https://github.com/oar-team/oar3.git; \
        cd oar3; \
        git checkout dce942bebc2; \
        pip3 install -e .; \
        cd /usr/lib/oar/schedulers; \
        ln -s /usr/local/bin/kamelot; \
        pip3 install psycopg2
        """
        logger.info("installing OAR server node: {}".format(str(nodes[0])))
        install_master = SshProcess(install_cmd + server_packages + ";" +
                                    install_oar_sched_cmd,
                                    nodes[0],
                                    connection_params={'user': '******'})
        install_master.run()
        install_oar_nodes.wait()
        if not install_master.ok:
            Report(install_master)

        # rewrite /etc/oar/oar.conf in place on every node
        configure_oar_cmd = """
        sed -i \
            -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
            -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
            -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
            -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
            -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
            -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
            -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
            -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
            -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
            -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
            -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
            -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
            -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
            -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
            /etc/oar/oar.conf
        """.format(result_dir=self.result_dir)
        configure_oar = Remote(configure_oar_cmd, nodes,
                               connection_params={'user': '******'})
        configure_oar.run()
        logger.info("OAR is configured on all nodes")

        # Configure server: create the DB, switch the default queue to the
        # kamelot scheduler and start the service
        create_db = "oar-database --create --db-is-local"
        config_oar_sched = ("oarnotify --remove-queue default;"
                            "oarnotify --add-queue default,1,kamelot")
        start_oar = "systemctl start oar-server.service"
        logger.info("configuring OAR database: {}".format(str(nodes[0])))
        config_master = SshProcess(create_db + ";" + config_oar_sched + ";" +
                                   start_oar,
                                   nodes[0],
                                   connection_params={'user': '******'})
        config_master.run()

        # propagate the oar user's SSH keys from the server to the nodes
        logger.info("configuring OAR SSH")
        oar_key = "/tmp/.ssh"
        Process('rm -rf ' + oar_key).run()
        Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                '-o ConnectTimeout=20 -rp -o User=root ' +
                nodes[0].address + ":/var/lib/oar/.ssh"
                ' ' + oar_key).run()
        Put(nodes[1:], [oar_key], "/var/lib/oar/",
            connection_params={'user': '******'}).run()

        # declare the resources of every compute node on the server
        add_resources_cmd = """
        oarproperty -a cpu || true; \
        oarproperty -a core || true; \
        oarproperty -c -a host || true; \
        oarproperty -a mem || true; \
        """
        for node in nodes[1:]:
            add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)
        add_resources = SshProcess(add_resources_cmd, nodes[0],
                                   connection_params={'user': '******'})
        add_resources.run()

        if add_resources.ok:
            logger.info("oar is now configured!")
        else:
            raise RuntimeError("error in the OAR configuration: Abort!")

    def run(self):
        """Run the experiment: reserve/deploy nodes, configure OAR if
        needed, then replay every workload of the sweep."""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))
        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            # reuse an existing reservation instead of submitting a new job
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id, site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes for a deterministic server/compute split
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:
                    self._configure_oar(nodes)

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] +
                                            " " + self.result_dir +
                                            " oar_gant_" + workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")
            except:
                # deliberate debug hook: drop into ipdb on any failure
                traceback.print_exc()
                ipdb.set_trace()
            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    # only delete jobs we submitted ourselves
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
class raevol_matrix(Engine):
    """Execo engine running Aevol / R-Aevol evolution experiments on
    Grid'5000, dispatching one parameter combination per host via worker
    threads, with resume-from-NFS-backup support."""

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(raevol_matrix, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to" + \
            "perform automatic virtual machines experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="02:00:00")
        self.options_parser.add_option("-j",
                                       dest="oargrid_job_id",
                                       help="oargrid_job_id to relaunch an engine",
                                       type=int,
                                       default=None)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")
        self.options_parser.add_option("-u",
                                       dest="selected_cluster",
                                       help="run on a specific cluster.",
                                       type="string",
                                       default="taurus")
        self.options_parser.add_option("-o",
                                       dest="outofchart",
                                       help="Run the engine outside days",
                                       action="store_true")
        self.options_parser.add_option("-s",
                                       dest="storage5k_job_id",
                                       help="storage5k_job_id to store the data",
                                       type=int)

    def run(self):
        """Main control loop: reserve hosts, then launch one workflow
        thread per free host until the sweep is exhausted."""
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = ['parasilo', 'paravance', 'parapluie',
                                 'paranoia']
        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts
                # for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                # Wait that the job starts
                logger.info('Waiting that the job start ' +
                            str(self.oar_job_id))
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'
                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts
                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " +
                            str(self.is_job_alive()) + " " +
                            str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    # wait until a host is freed by a finished thread
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(
                                    tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # sweep exhausted: drain the running threads
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    # force a new reservation on the next outer iteration
                    self.oar_job_id = None
        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """Build the persistent sweeper over the experiment parameters."""
        parameters = {
            'seed': [51456165, 33263658, 7158785, 456847894, 1223144,
                     878944, 121145, 3587842],
            'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'],
            'env': ['const', 'lat_3', 'lat_all'],
            'selection': [750, 2000, 4000]
        }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """Reserve all free nodes of the selected cluster via oargridsub."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=[self.options.selected_cluster],
                                starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {self.options.selected_cluster: 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted[self.options.selected_cluster] = resources[
            self.options.selected_cluster]
        actual_resources = distribute_hosts(resources, wanted)
        job_specs = get_jobs_specs(actual_resources, name='Aevol_diff_area')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(
            job_specs,
            walltime=end_date - start_date,
            # bug fix: was ['besteffort"' 'allow_classic_ssh'], which
            # concatenates into the single malformed type
            # 'besteffort"allow_classic_ssh'
            job_type=['besteffort', 'allow_classic_ssh'])
        logger.info("Reservation done")

    def make_reservation_local(self):
        """Perform a reservation of the required number of nodes, widening
        the time window by 3 days until enough nodes are found."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(starttime +
                      timedelta_to_seconds(datetime.timedelta(days=3,
                                                              minutes=1)))
        self.cluster = self.options.selected_cluster
        startdate, n_nodes = self._get_nodes(starttime, endtime)
        while not n_nodes:
            logger.info('No enough nodes found between %s and %s, ' +
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(starttime +
                          timedelta_to_seconds(datetime.timedelta(days=3,
                                                                  minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            # give up after looking 6 weeks ahead
            if starttime > int(time.time() + timedelta_to_seconds(
                    datetime.timedelta(weeks=6))):
                logger.error('There are not enough nodes on %s for your ' +
                             'experiments, abort ...', self.cluster)
                exit()
        startdate = []
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        # strip escaping added by get_jobs_specs so oarsub accepts it
        tmp = str(sub.resources).replace('\\', '')
        sub.resources = tmp.replace('"', '')
        sub.walltime = self.options.walltime
        sub.additional_options = '-t allow_classic_ssh -t besteffort'
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: besteffort, n_nodes: %s', str(n_nodes))

    def _get_nodes(self, starttime, endtime):
        """Return (startdate, n_nodes) of the first slot offering enough
        nodes on self.cluster, or (False, False) when none does."""
        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        logger.info("nodes %s in %s at %s", str(n_nodes),
                    str(self.cluster), format_date(startdate))
        while n_nodes < self.options.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, n_nodes

    def workflow(self, comb, host, comb_dir):
        """Run one parameter combination on *host*, resuming from the NFS
        backup when one exists; mark the combination done/canceled."""
        comb_ok = False
        thread_name = style.Thread(str(host).split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))
        try:
            self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "
            bucketname = self.working_dir + '/raevol_5_mut_lat/' + \
                slugify(comb) + '/'
            if os.path.isdir(bucketname) and \
                    os.path.exists(bucketname + '/last_gener.txt'):
                logger.info(thread_name + "Resuming AEVOL from NFS backup")
                # with-block fixes a file handle leak (was never closed)
                with open(bucketname + '/last_gener.txt', 'r') as gen_file:
                    last_gen = gen_file.read()
                if int(last_gen) < 500000:
                    logger.info(thread_name + "Resuming AEVOL Run from " +
                                str(int(last_gen)))
                    rem = Remote(
                        self.export + 'cd ' + bucketname +
                        '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16' +
                        ' -e 300000 -r ' + last_gen + ' >> aevol_run.log',
                        [host]).run()
                    if rem.ok:
                        comb_ok = True
                else:
                    # already past the target generation: nothing to do
                    comb_ok = True
            else:
                Remote('mkdir -p ' + bucketname, [host]).run()
                param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in'
                logger.info(thread_name + 'Generate config file ' +
                            param_file)
                # instantiate the parameter template into a temp file
                f_template = open(param_file)
                fd, outfile = mkstemp(dir='/tmp/',
                                      prefix=slugify(comb) + '_param')
                f = os.fdopen(fd, 'w')
                for line in f_template:
                    # NOTE(review): 'fuzzy', 'move' and 'height' are not
                    # produced by define_parameters() (seed/mutation/env/
                    # selection) — confirm the intended sweep; this branch
                    # would raise KeyError as-is.
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('FUZZY_VERSION', str(comb['fuzzy']))
                    if comb['move']:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65')
                    else:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6')
                    line = line.replace('GAUSSIAN_HEIGHT',
                                        str(comb['height']))
                    f.write(line)
                f_template.close()
                f.close()
                put_file = Put([host], [outfile],
                               remote_location=bucketname).run()
                if not put_file.ok:
                    exit()
                os.remove(outfile)
                Remote('cd ' + bucketname + '; cp ' +
                       outfile.split('/')[-1] + ' param.in', [host]).run()
                logger.info(thread_name + "Launching AEVOL Create")
                Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log',
                    [host]).run()
                logger.info(thread_name + "Launching AEVOL Run")
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log',
                    [host]).run()
                if rem.ok:
                    comb_ok = True
                # TODO(review): result retrieval to comb_dir was commented
                # out in the original; results stay in bucketname on NFS.
                logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                            slugify(comb))
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + ': ' + slugify(comb) +
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) +
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """True while the OAR job is not in Error and its walltime has not
        elapsed."""
        rez = get_oar_job_info(self.oar_job_id)
        if rez['state'] == 'Error':
            return False
        return (rez["start_date"] + rez["walltime"]) > time.time()

    def get_immediately_available_nodes(self, list_of_clusters, walltime,
                                        ignore_besteffort=False):
        """Return (start_date, end_date, {cluster: n_nodes}) for the nodes
        free right now on the given clusters."""
        # bug fix: the ignore_besteffort parameter was previously ignored
        # (hard-coded False in the get_planning call)
        planning = get_planning(list_of_clusters,
                                ignore_besteffort=ignore_besteffort)
        slots = compute_slots(planning, walltime)
        wanted = {cluster: 0 for cluster in list_of_clusters}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        actual_resources = {
            resource: n_nodes
            for resource, n_nodes in resources.items()
            if resource in list_of_clusters and n_nodes > 0
        }
        return start_date, end_date, actual_resources

    def submit_all_available_best_effort(self, list_of_clusters, walltime):
        """Submit a besteffort job over every immediately available node
        and record (self.oar_job_id, self.frontend)."""
        start_date, end_date, resources = \
            self.get_immediately_available_nodes(list_of_clusters, walltime,
                                                 ignore_besteffort=False)
        job_specs = get_jobs_specs(resources)
        for j, f in job_specs:
            j.job_type = "besteffort"
            j.additional_options = "-t allow_classic_ssh"
        (self.oar_job_id, self.frontend) = oarsub(job_specs)[0]
        print(job_specs)
        return self.oar_job_id
class paasage_simu(Engine):
    """Execo engine running the PaaSage SGCB n-tier simulation over a
    grid-wide deploy reservation, one combination per core."""

    # external tool names/paths used by the simulation environment
    JVM = 'java'
    SGCBJAR = 'SGCB_nTier.jar'
    PJDUMP = 'pj_dump'
    RSCRIPT = 'Rscript'

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(paasage_simu, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to" + \
            "perform automatic virtual machines experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="05:00:00")
        self.options_parser.add_option("-j",
                                       dest="oargrid_job_id",
                                       help="oargrid_job_id to relaunch an engine",
                                       type=int)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")

    def run(self):
        """Main control loop: reserve and deploy hosts, then dispatch one
        workflow thread per available core until the sweep is exhausted."""
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts
                # for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait that the job starts
                logger.info('Waiting that the job start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'
                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                # clean any leftovers from a previous run and push the
                # simulation scripts/configs to every host
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()
                Put(self.hosts,
                    ["run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                     "platform_aws.xml", "cloud_ec2.xml"],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run()
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads: one slot per core
                # of each deployed host
                available_hosts = [
                    host for host in self.hosts
                    for i in range(get_host_attributes(host)
                                   ['architecture']['smt_size'])
                ]
                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    # wait until a core is freed by a finished thread
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(
                                    tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # sweep exhausted: drain the running threads
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    # force a new reservation on the next outer iteration
                    self.oargrid_job_id = None
        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """Build the persistent sweeper from the conf.xml description."""
        parameters = self.get_parameters("conf.xml")
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """Reserve up to n_nodes grid5000 nodes in deploy mode."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=['grid5000'], starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {"grid5000": 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted['grid5000'] = min(resources['grid5000'],
                                 self.options.n_nodes)
        actual_resources = distribute_hosts(resources, wanted)
        job_specs = get_jobs_specs(actual_resources, name='Paasage_Simu')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(job_specs,
                                            start_date,
                                            walltime=end_date - start_date,
                                            job_type="deploy")
        logger.info("Reservation done")

    def create_string(self, param):
        """Serialize a combination dict as 'key_value_' pairs for the
        simulation command line and trace file names."""
        res_str = ""
        # items() instead of Py2-only iteritems()
        for key, value in param.items():
            res_str += key + "_" + str(value) + "_"
        return res_str

    def workflow(self, comb, host, comb_dir):
        """Run one combination on *host* and fetch its CSV trace back into
        *comb_dir*; mark the combination done/canceled."""
        comb_ok = False
        thread_name = style.Thread(host.split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))
        try:
            logger.info(thread_name + 'Generate conf file')
            param_str = self.create_string(comb)
            Remote(
                "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb "
                + param_str, [host]).run()

            logger.info(thread_name + 'Run code')
            Remote(
                "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s"
                % param_str, [host]).run()

            logger.info(thread_name + 'Get results')
            traceFile = "ntier_" + param_str
            get_results = Get(
                [host],
                ["/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" +
                 traceFile + ".csv"],
                local_location=comb_dir).run()
            for p in get_results.processes:
                if not p.ok:
                    logger.error(
                        host +
                        ': Unable to retrieve the files for combination %s',
                        slugify(comb))
                    exit()
            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + ': ' + slugify(comb) +
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) +
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """True while the oargrid job's walltime has not elapsed."""
        rez = get_oargrid_job_info(self.oargrid_job_id)
        return (rez["start_date"] + rez["walltime"]) > time.time()

    def get_parameters(self, file_name):
        """Get the parameters to sweep, from the configuration file.

        Each <instance type=... quantity=...> yields a list of quantities:
        either the single given number, or the inclusive range 'a-b'.
        """
        tree = ET.parse(file_name)
        rootSrc = tree.getroot()
        param = dict()
        for inst in rootSrc.iter("instance"):
            ty = inst.get("type")
            qt = inst.get("quantity")
            if qt.isdigit():
                # bug fix: was param[ty] = qt (a raw string), which made
                # sweep() iterate over its characters — wrong for any
                # multi-digit quantity, and inconsistent with the int list
                # produced by the range branch below
                param[ty] = [int(qt)]
            else:
                ends = qt.split("-")
                param[ty] = range(int(ends[0]), int(ends[1]) + 1)
        return param
class compil_aevol(Engine):
    """Engine compiling the aevol / raevol simulator for every combination
    of BLAS backend, experiment flavour, compiler and parallel runtime.

    Builds run locally (execo Process), one combination at a time; binaries
    are copied into one directory per combination.
    """

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(compil_aevol, self).__init__()

    def run(self):
        """Main loop: compile every remaining parameter combination."""
        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()

            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                comb = self.sweeper.get_next()
                if comb:
                    self.workflow(comb)
        finally:
            logger.info("Compilation DONE")

    def define_parameters(self):
        """Build the sweeper over the four compilation axes."""
        parameters = {
            'blas': ['none', 'mkl', 'atlas', 'openblas'],
            'experiment': ['aevol', 'raevol'],
            'compilator': ['gcc', 'intel'],
            'parallel': ['openmp', 'tbb']
        }
        sweeps = sweep(parameters)
        # Persist sweeper state so an interrupted run resumes where it stopped.
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def workflow(self, comb):
        """Configure and build one combination, then mark it done/canceled."""
        comb_ok = False
        logger.info(slugify(comb) + \
                    ' starts to compile')
        try:
            # Intel compiler environment; prepended even for gcc builds.
            export = "source /opt/intel/bin/compilervars.sh intel64; "

            src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/"
            bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/"

            # Translate the combination into ./configure flags.
            configure_option = "--with-tracing --without-x"

            if comb['parallel'] == 'tbb':
                configure_option += " --with-tbb"

            if comb['blas'] == 'openblas':
                configure_option += " --with-blas"
            elif comb['blas'] == 'mkl':
                configure_option += " --with-mkl"
            elif comb['blas'] == 'atlas':
                configure_option += " --with-atlas"

            if comb['experiment'] == 'raevol':
                configure_option += " --with-raevol"

            if comb['compilator'] == 'intel':
                configure_option += " CXX=icc"

            # One output directory per combination,
            # e.g. .../aevol_gcc_openmp_mkl
            full_bin_directory = bin_directory + comb['experiment'] + '_' + \
                comb['compilator'] + '_' + comb['parallel'] + '_' + comb['blas']

            try:
                os.mkdir(full_bin_directory)
            except:
                # NOTE(review): bare except assumes "directory already
                # exists" and empties it — any other OSError (permissions,
                # bad path) is silently treated the same way.
                for f in os.listdir(full_bin_directory):
                    os.remove(full_bin_directory + "/" + f)

            p = Process(export + 'cd ' + src_directory +
                        '; autoreconf; ./configure ' + configure_option +
                        '; make clean; make; cp src/aevol_run ' +
                        full_bin_directory + '/; cp src/aevol_create ' +
                        full_bin_directory + '/')
            p.shell = True
            # p.run()
            # NOTE(review): p.run() is commented out, so the build command is
            # never executed and p.stdout below is empty — presumably disabled
            # for a dry run; confirm before re-enabling.
            print p.stdout
            comb_ok = True
        finally:
            # Record the outcome in the sweeper (done, or re-queued for retry).
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(slugify(comb) + \
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(slugify(comb) + \
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """Return True while the OAR job's walltime has not elapsed."""
        rez = get_oar_job_info(self.oar_job_id)
        if (rez["start_date"] + rez["walltime"] > time.time()):
            return True
        else:
            return False
class oar_replay_workload(Engine):
    """Engine that deploys a full OAR cluster (server + nodes) on a
    Grid'5000 deploy job and replays recorded workloads against it.

    Results (logs, gantt outputs, copies of the workloads and of the JSON
    config) are collected under self.result_dir.
    """

    def init(self):
        # Register command-line options on the parser provided by Engine.
        parser = self.options_parser
        parser.add_option('--is_a_test',
                          dest='is_a_test',
                          action='store_true',
                          default=False,
                          help='prefix the result folder with "test", enter '
                          'a debug mode if fails and remove the job '
                          'afterward, unless it is a reservation')
        parser.add_option('--already_configured',
                          dest='already_configured',
                          action='store_true',
                          default=False,
                          help='if set, the OAR cluster is not re-configured')
        parser.add_option('--reservation_id',
                          help="Grid'5000 reservation job ID")
        # NOTE(review): add_option and add_argument are mixed on the same
        # parser — relies on execo's options parser supporting both; confirm.
        parser.add_argument('experiment_config',
                            'The config JSON experiment description file')

    def setup_result_dir(self):
        """Create a timestamped result directory ('test_' prefix in test mode)."""
        is_a_test = self.options.is_a_test
        run_type = ""
        if is_a_test:
            run_type = "test_"
        self.result_dir = script_path + '/' + run_type + 'results_' + \
            time.strftime("%Y-%m-%d--%H-%M-%S")
        # NOTE(review): 'resutlt' is a typo, but it is a runtime log string —
        # left untouched in this documentation-only pass.
        logger.info('resutlt directory: {}'.format(self.result_dir))

    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))
        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        # Reuse an existing reservation when one was given on the CLI,
        # otherwise submit a new deploy job.
        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id, site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                # When already_configured, the deployed-check command skips
                # re-deployment of nodes that already run the environment.
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:

                    # install OAR.  nodes[0] is the OAR server, the rest are
                    # compute nodes; node installs run concurrently with the
                    # server install (start() now, wait() later).
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    # Install the oar3 kamelot scheduler from a pinned commit.
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    # Point OAR at the local Pg database, raise the log level
                    # and disable cpuset/finaud management via in-place sed.
                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys: fetch the server's oar key pair to
                    # /tmp locally, then push it to every compute node.
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={'user': '******'}).run()

                    # Declare the OAR resources: one entry per compute node,
                    # hard-coded at 1 cpu / 4 cores / 16 units of memory.
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        " oar_gant_" + workload_file,
                        nodes[0])
                    # Tee the replay's stdout into a per-workload .out file.
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                # Interactive post-mortem on any failure (ipdb debugger).
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                # Only delete jobs we submitted ourselves; a user-supplied
                # reservation is left alone.
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
class DVFS(Engine):
    """Engine running NPB MPI benchmarks under CPU-frequency (DVFS)
    settings and collecting sar system metrics, kwapi power data and Intel
    APS performance snapshots for each combination."""

    def __init__(self, result_dir, cluster, site):
        Engine.__init__(self)
        self.result_dir = result_dir  # where sweeps and per-comb outputs go
        self.cluster = cluster        # host names; cluster[0] acts as master
        self.site = site              # Grid'5000 site, used for power metrics

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.run_xp()

    def define_parameters(self):
        """Build the sweeper: frequency x NPB class x benchmark x core count."""
        nbNodes = len(self.cluster)
        # build parameters and make nbCore list per benchmark
        freqList = [2534000, 2000000, 1200000]
        n_nodes = float(len(self.cluster))
        # Total core count = nodes * cores-per-node (counted on first host).
        max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                              self.cluster[0],
                              connection_params={'user': '******'
                                                 }).run().stdout
        max_core = n_nodes * float(max_core)
        # Powers of two strictly between n_nodes and max_core.
        even = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (2**i for i in count(0, 1)))))
        # Perfect squares in the same range (SP/BT want square process counts).
        powerTwo = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (i**2 for i in count(0, 1)))))

        # Define parameters.  Most benchmarks are disabled; only 'sp' is
        # swept at the moment, at a single fixed frequency.
        self.parameters = {
            'Repeat': [1],
            "Freq": [2534000],
            "NPBclass": ['C'],
            "Benchmark": {
                # 'ft': {
                #     'n_core': even
                # },
                # 'ep': {
                #     'n_core': even
                # },
                # 'lu': {
                #     'n_core': even
                # },
                # 'is': {
                #     'n_core': even
                # },
                # 'sg': {
                #     'n_core': even
                # },
                # 'bt': {
                #     'n_core': powerTwo
                # },
                'sp': {
                    'n_core': powerTwo
                }
            }
        }
        logger.info(self.parameters)

        # make all possible parameters object,
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def run_xp(self):
        """Iterate over the parameters and execute the bench"""
        master = self.cluster[0]
        opt = ''
        while len(self.sweeper.get_remaining()) > 0:
            # Take sweeper
            comb = self.sweeper.get_next()
            logger.info('Processing new combination %s' % (comb, ))

            try:
                # metric from linux sar tools, works with clock
                def takeMetric(path,
                               startTime,
                               endTime,
                               metric=['cpu', 'mem', 'disk', 'swap',
                                       'network']):
                    # Dump each requested sar metric, per host, into
                    # <path>/<host>-<metric>.txt for the given time window.
                    # NOTE(review): mutable default argument — shared across
                    # calls; harmless here because it is never mutated.
                    opt = ''
                    cmd_template_sar = (
                        "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                    )
                    for met in metric:
                        # Map the metric name to the matching sar flag.
                        if met == 'cpu':
                            opt = 'u'
                        elif met == 'mem':
                            opt = 'r'
                        elif met == 'disk':
                            opt = 'dp'
                        elif met == 'swap':
                            opt = 'S'
                        elif met == 'network':
                            opt = 'n DEV'

                        cmd = cmd_template_sar.format(opt=opt,
                                                      startTime=startTime,
                                                      endTime=endTime)
                        for host in self.cluster:
                            hE = SshProcess(
                                cmd,
                                host,
                                connection_params={'user': '******'})
                            hE.run()
                            stdMetric = host + '-' + met + '.txt'
                            with open(os.path.join(path, stdMetric),
                                      "w") as sout:
                                sout.write(hE.stdout)

                # Set CPU Freq and Policy according current combination
                cmd_template_Freq_Policy = ("cpufreq-set -r -g {policy}")
                cmd_template_Freq = ("cpufreq-set -r -f {freq}")

                if comb['Freq'] == 'OnDemand':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='ondemand')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                elif comb['Freq'] == 'conservative':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='conservative')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                else:
                    # Numeric frequency: pin the userspace governor, then set
                    # the exact frequency.
                    # NOTE(review): nesting reconstructed from a mangled
                    # paste; with the current Freq list ([2534000]) this
                    # branch is always taken, so behavior is unchanged either
                    # way — confirm against the original layout.
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='userspace')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                    cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                    Remote(cmd_freq,
                           master,
                           connection_params={'user': '******'}).run()

                # build command
                src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
                cmd_mpirun_template = (
                    "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}"
                )
                cmd_mpirun = cmd_mpirun_template.format(
                    opt='',
                    pr1=comb['n_core'],
                    typeNPB=comb['Benchmark'],
                    NPBclass=comb['NPBclass'],
                    pr2=comb['n_core'])
                cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                    src, cmd_mpirun, slugify(comb))

                # NOTE(review): plain concatenation — result_dir presumably
                # ends with a separator; otherwise paths fuse. Confirm.
                curPath = self.result_dir + slugify(comb)

                # run Mpi through execo remote SshProcess
                def runMpi(cmd):
                    # Run the benchmark on the master, persist stdout/stderr
                    # under curPath, return the process success flag.
                    act = SshProcess(cmd,
                                     master,
                                     connection_params={'user': '******'},
                                     shell=True)
                    act.run()

                    if not os.path.exists(curPath):
                        os.makedirs(curPath)
                    with open(os.path.join(curPath, "stdout.txt"),
                              "a+") as sout, open(
                                  os.path.join(curPath, "stderr.txt"),
                                  "w") as serr:
                        sout.write(act.stdout)
                        serr.write(act.stderr)
                    return act.ok

                # start clock and exec command in the master node
                time.sleep(5)
                startUnix = int(time.time())
                start24Hour = datetime.datetime.fromtimestamp(
                    startUnix).strftime('%H:%M:%S')
                task1 = runMpi(cmd)
                endUnix = int(time.time())
                end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime(
                    '%H:%M:%S')
                time.sleep(5)

                # Record wall-clock timing for this run.
                with open(os.path.join(curPath, "executionTime.txt"),
                          "w") as sout:
                    sout.write(
                        'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                            str(endUnix - startUnix), start24Hour, end24Hour))
                takeMetric(curPath, start24Hour, end24Hour,
                           ['cpu', 'mem', 'disk', 'swap', 'network'])

                # collect power from kWAPI: grid5000 infrastructure made tool
                for hostname in self.cluster:
                    powerOut = '{}_power'.format(hostname)
                    collect_metric(startUnix, endUnix, 'power', curPath,
                                   self.site, powerOut, hostname)

                st = '/tmp/out/' + slugify(comb)
                intelAppPerf = str(st + '.html')
                # get the data from ['Application Performance Snapshot',
                # 'Storage Performance Snapshot']
                # https://software.intel.com/en-us/performance-snapshot
                Get(master, [intelAppPerf],
                    curPath,
                    connection_params={'user': '******'}).run()

                if task1:
                    logger.info("comb ok: %s" % (comb, ))
                    self.sweeper.done(comb)
                    continue

            except OSError as err:
                print("OS error: {0}".format(err))
            except ValueError:
                print("Could not convert data to an integer.")
            except:
                # Unknown failure: report and re-raise (aborts the engine).
                print("Unexpected error:", sys.exc_info()[0])
                raise

            # Reached on any handled failure or when the run was not ok.
            logger.info("comb NOT ok: %s" % (comb, ))
            self.sweeper.cancel(comb)