Пример #1
0
    def _add_xml_elements(self):
        """Add sites, clusters, hosts to self.state """
        _state = self.state
        logger.debug('Initial state \n %s', prettify(_state))
        for site in self.sites:
            SubElement(_state, 'site', attrib={'id': site})
        else:
            el_site = SubElement(_state, 'site', attrib={'id': 'unknown'})
        logger.debug('Sites added \n %s', prettify(_state))
        for cluster in self.clusters:
            el_site = _state.find("./site[@id='" + get_cluster_site(cluster) \
                                  + "']")
            SubElement(el_site, 'cluster', attrib={'id': cluster})
        else:
            el_cluster = SubElement(el_site, 'cluster', attrib={'id': 'unknown'})
        logger.debug('Clusters added \n %s', prettify(_state))
        print 'xxxxxxx', self.hosts
        hosts_attr = get_CPU_RAM_FLOPS(self.hosts)
        for host in self.hosts:
            if host in get_g5k_hosts(): 
                el_cluster = _state.find(".//cluster/[@id='" +
                                         get_host_cluster(host) + "']")

            SubElement(el_cluster, 'host', attrib={'id': host,
                                                   'state': 'Undeployed',
                                                   'cpu': str(hosts_attr[host]['CPU']),
                                                   'mem': str(hosts_attr[host]['RAM'])})
        logger.debug('Hosts added \n %s', prettify(_state))
Пример #2
0
def _build_reservation_criteria(machines, networks):
    criteria = {}
    # machines reservations
    for desc in machines:
        cluster = desc["cluster"]
        nodes = desc["nodes"]
        if nodes:
            site = api.get_cluster_site(cluster)
            criterion = "{cluster='%s'}/nodes=%s" % (cluster, nodes)
            criteria.setdefault(site, []).append(criterion)

    # network reservations
    vlans = [network for network in networks if network["type"] in KAVLAN_TYPE]
    for desc in vlans:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "{type='%s'}/vlan=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    subnets = [
        network for network in networks if network["type"] in SUBNET_TYPE
    ]
    for desc in subnets:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "%s=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    return criteria
Пример #3
0
def make_reservation(resources, job_name, walltime, reservation_date):
    machines = resources["machines"]
    networks = resources["networks"]

    criteria = {}
    # machines reservations
    for desc in machines:
        cluster = desc["cluster"]
        nodes = desc["nodes"]
        site = api.get_cluster_site(cluster)
        criterion = "{cluster='%s'}/nodes=%s" % (cluster, nodes)
        criteria.setdefault(site, []).append(criterion)

    # network reservations
    non_prod = [network for network in networks if network["type"] != "prod"]
    for desc in non_prod:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "{type='%s'}/vlan=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    jobs_specs = [(ex5.OarSubmission(resources='+'.join(c), name=job_name), s)
                  for s, c in criteria.items()]

    # Make the reservation
    gridjob, _ = ex5.oargridsub(jobs_specs,
                                walltime=walltime.encode('ascii', 'ignore'),
                                reservation_date=reservation_date,
                                job_type='deploy')

    if gridjob is None:
        raise Exception('No oar job was created')
    return gridjob
Пример #4
0
    def get_host(self):
        """Returns the hosts from an existing reservation (if any), or from
		a new reservation"""

        # Look if there is a running job
        self.site = get_cluster_site(self.config['cluster'])
        jobs = EX5.get_current_oar_jobs([self.site])

        self.job_id = None
        for t in jobs:
            if EX5.get_oar_job_info(
                    t[0], self.site)['name'] == self.options.job_name:
                self.job_id = t[0]
                break

        if self.job_id:
            logger.info('Using job %s' % style.emph(self.job_id))
        else:
            logger.info('Making a new reservation')
            self._make_reservation(self.site)

        if not self.job_id:
            logger.error("Could not get a reservation for the job")
            exit(6)

        EX5.wait_oar_job_start(self.job_id, self.site)

        pp(EX5.get_oar_job_nodes(self.job_id, self.site))
        return EX5.get_oar_job_nodes(self.job_id, self.site)[0]
Пример #5
0
    def _add_xml_elements(self):
        """Add sites, clusters, hosts to self.state """
        _state = self.state
        logger.debug('Initial state \n %s', prettify(_state))
        for site in self.sites:
            SubElement(_state, 'site', attrib={'id': site})
        else:
            el_site = SubElement(_state, 'site', attrib={'id': 'unknown'})
        logger.debug('Sites added \n %s', prettify(_state))
        for cluster in self.clusters:
            el_site = _state.find("./site[@id='" + get_cluster_site(cluster) \
                                  + "']")
            SubElement(el_site, 'cluster', attrib={'id': cluster})
        else:
            el_cluster = SubElement(el_site,
                                    'cluster',
                                    attrib={'id': 'unknown'})
        logger.debug('Clusters added \n %s', prettify(_state))
        hosts_attr = get_CPU_RAM_FLOPS(self.hosts)
        for host in self.hosts:
            if host in get_g5k_hosts():
                el_cluster = _state.find(".//cluster/[@id='" +
                                         get_host_cluster(host) + "']")

            SubElement(el_cluster,
                       'host',
                       attrib={
                           'id': host,
                           'state': 'Undeployed',
                           'cpu': str(hosts_attr[host]['CPU']),
                           'mem': str(hosts_attr[host]['RAM'])
                       })
        logger.debug('Hosts added \n %s', prettify(_state))
Пример #6
0
def make_reservation(resources, job_name, walltime, reservation_date, queue,
                     reservation_type):
    machines = resources["machines"]
    networks = resources["networks"]

    criteria = {}
    # machines reservations
    for desc in machines:
        cluster = desc["cluster"]
        nodes = desc["nodes"]
        site = api.get_cluster_site(cluster)
        criterion = "{cluster='%s'}/nodes=%s" % (cluster, nodes)
        criteria.setdefault(site, []).append(criterion)

    # network reservations
    non_prod = [network for network in networks if network["type"] != "prod"]
    for desc in non_prod:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "{type='%s'}/vlan=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    jobs_specs = [(ex5.OarSubmission(resources='+'.join(c), name=job_name), s)
                  for s, c in criteria.items()]

    # Make the reservation
    if reservation_type == "oar":
        oarsub_description = jobs_specs[0][0]
        oarsub_description.walltime = walltime
        oarsub_description.reservation_date = reservation_date
        oarsub_description.job_type = 'deploy'
        oarsub_description.queue = queue
        gridjobs = ex5.oarsub(jobs_specs)
        if len(gridjobs) > 0:
            gridjob = gridjobs[0][0]
    else:
        gridjob, _ = ex5.oargridsub(jobs_specs,
                                    walltime=walltime,
                                    reservation_date=reservation_date,
                                    job_type='deploy',
                                    queue=queue)

    if gridjob is None:
        raise Exception('No oar job was created')
    return gridjob
Пример #7
0
def grid_make_reservation(job_name, walltime, reservation_date, queue,
                          job_type, machines, networks):
    criteria = {}
    # machines reservations
    for desc in machines:
        cluster = desc["cluster"]
        nodes = desc["nodes"]
        site = api.get_cluster_site(cluster)
        criterion = "{cluster='%s'}/nodes=%s" % (cluster, nodes)
        criteria.setdefault(site, []).append(criterion)

    # network reservations
    vlans = [network for network in networks if network["type"] in KAVLAN_TYPE]
    for desc in vlans:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "{type='%s'}/vlan=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    subnets = [
        network for network in networks if network["type"] in SUBNET_TYPE
    ]
    for desc in subnets:
        site = desc["site"]
        n_type = desc["type"]
        criterion = "%s=1" % n_type
        criteria.setdefault(site, []).append(criterion)

    jobs_specs = [(ex5.OarSubmission(resources='+'.join(c), name=job_name), s)
                  for s, c in criteria.items()]

    # Make the reservation
    gridjob, _ = ex5.oargridsub(jobs_specs,
                                walltime=walltime,
                                reservation_date=reservation_date,
                                job_type=job_type,
                                queue=queue)

    if gridjob is None:
        raise Exception('No oar job was created')
    return gridjob
Пример #8
0
    def _make_reservation(self):
        """Make a new reservation."""

        # Extract the list of criteria (ie, `oarsub -l
        # *criteria*`) in order to compute a specification for the
        # reservation.
        criteria = {}
        # Actual criteria are :
        # - Number of node per site
        for cluster, roles in self.config["resources"].items():
            site = get_cluster_site(cluster)
            nb_nodes = reduce(operator.add, map(int, roles.values()))
            criterion = "{cluster='%s'}/nodes=%s" % (cluster, nb_nodes)
            criteria.setdefault(site, []).append(criterion)

        for site, vlan in self.config["vlans"].items():
            criteria.setdefault(site, []).append(vlan)

        # Compute the specification for the reservation
        jobs_specs = [(OarSubmission(resources = '+'.join(c),
                                     name = self.config["name"]), s)
                      for s, c in criteria.items()]
        logger.info("Criteria for the reservation: %s" % pf(jobs_specs))

        # Make the reservation
        gridjob, _ = EX5.oargridsub(
            jobs_specs,
            reservation_date=self.config['reservation'],
            walltime=self.config['walltime'].encode('ascii', 'ignore'),
            job_type='deploy'
        )

        # TODO - move this upper to not have a side effect here
        if gridjob is not None:
            self.gridjob = gridjob
            logger.info("Using new oargrid job %s" % style.emph(self.gridjob))
        else:
            logger.error("No oar job was created.")
            sys.exit(26)
Пример #9
0
    def _create_reservation(self, conf):
        """Create the OAR Job specs."""
        provider_conf = conf['provider']
        criteria = {}
        # NOTE(msimonin): Traverse all cluster demands in alphebetical order
        # test_create_reservation_different_site needs to know the order
        for cluster, roles in sorted(conf["resources"].items(),
                                     key=lambda x: x[0]):
            site = api.get_cluster_site(cluster)
            nb_nodes = reduce(operator.add, map(int, roles.values()))
            criterion = "{cluster='%s'}/nodes=%s" % (cluster, nb_nodes)
            criteria.setdefault(site, []).append(criterion)

        for site, vlan in provider_conf["vlans"].items():
            criteria.setdefault(site, []).append(vlan)

        # Compute the specification for the reservation
        jobs_specs = [(OarSubmission(resources='+'.join(c),
                                     name=provider_conf["name"]), s)
                      for s, c in criteria.items()]
        logging.info("Criteria for the reservation: %s" % pf(jobs_specs))
        return jobs_specs
Пример #10
0
def find_free_slot(slots, resources_wanted):
    """Return the first slot (a tuple start date, end date, resources) with enough resources

    :param slots: list of slots returned by ``compute_slots``

    :param resources_wanted: a dict describing the wanted ressources
      ``{'grid5000': 50, 'lyon': 20, 'stremi': 10 }``"""
    # We need to add the clusters nodes to the total nodes of a site
    real_wanted = resources_wanted.copy()
    for cluster, n_nodes in resources_wanted.items():
        if cluster in get_g5k_clusters(queues=None):
            site = get_cluster_site(cluster)
            if site in resources_wanted:
                real_wanted[site] += n_nodes

    for slot in slots:
        vlan_free = True
        if 'kavlan' in resources_wanted:
            if isinstance(slot[2]['kavlan'], int):
                if slot[2]['kavlan'] == 0:
                    vlan_free = False
            elif isinstance(slot[2]['kavlan'], list):
                if len(slot[2]['kavlan']) == 0:
                    vlan_free = False
        slot_ok = True
        for element, n_nodes in slot[2].items():
            if element in real_wanted and real_wanted[element] > n_nodes \
                and real_wanted != 'kavlan':
                slot_ok = False

        if slot_ok and vlan_free:
            if 'kavlan' in resources_wanted:
                resources_wanted['kavlan'] = slot[2]['kavlan']
            return slot

    return None, None, None
Пример #11
0
def get_vms_slot(vms, elements, slots, excluded_elements=None):
    """Return a slot with enough RAM and CPU """
    chosen_slot = None
    mem = vms[0]['mem']
    cpu = vms[0]['n_cpu']
    req_ram = sum([vm['mem'] for vm in vms])
    req_cpu = sum([vm['n_cpu'] for vm in vms]) / 3
    logger.debug('RAM %s CPU %s', req_ram, req_cpu)

    for element in excluded_elements:
        if element in get_g5k_sites():
            excluded_elements += [cluster for cluster
                                  in get_site_clusters(element)
                                  if cluster not in excluded_elements]

    if 'grid5000' in elements:
        clusters = [cluster for cluster in get_g5k_clusters()
                    if cluster not in excluded_elements
                    and get_cluster_site not in excluded_elements]
    else:
        clusters = [element for element in elements
                    if element in get_g5k_clusters()
                    and element not in excluded_elements]
        for element in elements:
            if element in get_g5k_sites():
                clusters += [cluster
                    for cluster in get_site_clusters(element)
                        if cluster not in clusters
                        and cluster not in excluded_elements]

    for slot in slots:
        hosts = []
        for element in slot[2]:
            if str(element) in clusters:
                n_hosts = slot[2][element]
                for i in range(n_hosts):
                    hosts.append(Host(str(element + '-1.' + \
                            get_cluster_site(element) + '.grid5000.fr')))
        attr = get_CPU_RAM_FLOPS(hosts)['TOTAL']

        if attr['CPU'] > req_cpu and attr['RAM'] > req_ram:
            chosen_slot = slot
            break
        del hosts[:]

    if chosen_slot is None:
        return None, None

    resources_needed = {}
    resources_available = chosen_slot[2]
    logger.debug('resources available' + pformat(resources_available))
    iter_clusters = cycle(clusters)
    while req_ram > 0 or req_cpu > 0:
        cluster = iter_clusters.next()
        if resources_available[cluster] == 0:
            clusters.remove(cluster)
            iter_clusters = cycle(clusters)
        else:
            host = cluster + '-1'
            attr = get_CPU_RAM_FLOPS([host])
            resources_available[cluster] -= 1
            req_ram -= float(attr[host]['RAM'] / mem) * mem
            req_cpu -= float(attr[host]['CPU'] / cpu) * cpu

            if cluster not in resources_needed:
                resources_needed[cluster] = 0
            resources_needed[cluster] += 1

    if 'kavlan' in elements:
        resources_needed['kavlan'] = 1

    logger.debug('resources needed' + pformat(resources_needed))
    return chosen_slot[0], distribute_hosts(chosen_slot[2], resources_needed,
                                            excluded_elements)
Пример #12
0
    def run(self):
        """Execute a test suite. The execution workflow is as follows:

        1. Parse command-line arguments.

        2. Define the parameters of the tests from the specified configuration
        file. Generate all the combination to test from the given parameters.

        3. Consume the combinations.

          3.1. Setup the cluster if it has not been done (first time or after a
          reservation ends.

          3.2. Load the dataset into the Hadoop cluster.

          3.3. Perform the experiments corresponding to the combinations linked
          to the loaded dataset.

        4. Clean all resources.
        """

        # Get parameters
        self.cluster = self.args[0]
        self.n_nodes = int(self.args[1])
        self.config_file = self.args[2]
        self.site = get_cluster_site(self.cluster)

        if not os.path.exists(self.config_file):
            logger.error("Params file " + self.config_file + " does not exist")
            sys.exit(1)

        # Set oar job id
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        # Main
        try:
            # Creation of the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While they are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:

                # SETUP
                # If no job, we make a reservation and prepare the hosts for the
                # experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    success = self.setup()
                    if not success:
                        break
                else:
                    self.hosts = get_oar_job_nodes(self.oar_job_id,
                                                   self.frontend)
                if not self.hc:
                    self.hc = HadoopCluster(self.hosts)
                # SETUP FINISHED

                # Getting the next combination (which requires a ds deployment)
                comb = self.sweeper.get_next()
                self.raw_comb = comb.copy()
                self.comb = comb
                self.prepare_dataset(comb)
                self.xp_wrapper(comb)

                # subloop over the combinations that use the same dataset
                while True:
                    newcomb = self.sweeper.get_next(
                        lambda r: filter(self._uses_same_ds, r))
                    if newcomb:
                        self.raw_comb = newcomb.copy()
                        try:
                            self.xp_wrapper(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    pass
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

            # Clean cluster
            if self.hc:
                if self.hc.initialized:
                    self.hc.clean()

            # Close summary files
            if self.summary_file:
                self.summary_file.close()
            if self.ds_summary_file:
                self.ds_summary_file.close()
Пример #13
0
    def run(self):
        """Execute a test suite. The execution workflow is as follows:

        1. Parse command-line arguments.

        2. Define the parameters of the tests from the specified configuration
        file. Generate all the combination to test from the given parameters.

        3. Consume the combinations.

          3.1. Setup the cluster if it has not been done (first time or after a
          reservation ends.

          3.2. Load the dataset into the Hadoop cluster.

          3.3. Perform the experiments corresponding to the combinations linked
          to the loaded dataset.

        4. Clean all resources.
        """

        # Get parameters
        self.cluster = self.args[0]
        self.n_nodes = int(self.args[1])
        self.config_file = self.args[2]
        self.site = get_cluster_site(self.cluster)

        if not os.path.exists(self.config_file):
            logger.error("Params file " + self.config_file + " does not exist")
            sys.exit(1)

        # Set oar job id
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        # Main
        try:
            # Creation of the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While they are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:

                # SETUP
                # If no job, we make a reservation and prepare the hosts for the
                # experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    success = self.setup()
                    if not success:
                        break
                else:
                    self.hosts = get_oar_job_nodes(self.oar_job_id,
                                                   self.frontend)
                if not self.hc:
                    self.hc = HadoopCluster(self.hosts)
                # SETUP FINISHED

                # Getting the next combination (which requires a ds deployment)
                comb = self.sweeper.get_next()
                self.raw_comb = comb.copy()
                self.comb = comb
                self.prepare_dataset(comb)
                self.xp_wrapper(comb)

                # subloop over the combinations that use the same dataset
                while True:
                    newcomb = self.sweeper.get_next(
                        lambda r: filter(self._uses_same_ds, r))
                    if newcomb:
                        self.raw_comb = newcomb.copy()
                        try:
                            self.xp_wrapper(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    pass
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

            # Clean cluster
            if self.hc:
                if self.hc.initialized:
                    self.hc.clean()

            # Close summary files
            if self.summary_file:
                self.summary_file.close()
            if self.ds_summary_file:
                self.ds_summary_file.close()
Пример #14
0
	def run(self):
		# Defining experiment parameters
		self.parameters = {
			'n_clients': [400, 450, 500, 550, 600],
			'n_transitions': [10000]
		}
		cluster = 'griffon'
		sweeps = sweep(self.parameters)
		sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
		server_out_path = os.path.join(self.result_dir, "server.out")
		
		self._updateStat(sweeper.stats())
		
		# Loop on the number of nodes
		while True:
			# Taking the next parameter combinations
			comb = sweeper.get_next()
			if not comb: break

			# Performing the submission on G5K
			site = get_cluster_site(cluster)
			self._log("Output will go to " + self.result_dir)
			
			n_nodes = int(math.ceil(float(comb['n_clients']) / EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
			self._log("Reserving {0} nodes on {1}".format(n_nodes, site))
			
			resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
			submission = EX5.OarSubmission(resources = resources, job_type = 'allow_classic_ssh', walltime ='00:10:00')
			
			job = EX5.oarsub([(submission, site)])
			self.__class__._job = job
			
			# Sometimes oarsub fails silently
			if job[0][0] is None:
				print("\nError: no job was created")
				sys.exit(1)
				
			# Wait for the job to start
			self._log("Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL))
			EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback = prediction)
			nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])
			
			# Deploying nodes
			#deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
			#run_deploy = EX5.deploy(deployment)
			#nodes_deployed = run_deploy.hosts[0]
			
			# Copying active_data program on all deployed hosts
			EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar', connexion_params = {'user': '******'}).run()
			EX.Put([nodes[0]], '../server.policy', connexion_params = {'user': '******'}).run()
			
			# Loop on the number of requests per client process
			while True:
				# Split the nodes
				clients = nodes[1:]
				server = nodes[0] 
				
				self._log("Running experiment with {0} nodes and {1} transitions per client".format(len(clients), comb['n_transitions']))
				
				# Launching Server on one node
				out_handler = FileOutputHandler(server_out_path)
				launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar', [server], stdout_handler = out_handler, stderr_handler = out_handler).start()
				self._log("Server started on " + server.address)
				time.sleep(2)
				
				# Launching clients
				rank=0
				n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size'];
				cores = nodes * n_cores
				cores = cores[0:comb['n_clients']] # Cut out the additional cores
				
				client_connection_params = {
						'taktuk_gateway': 'lyon.grid5000.fr',
						'host_rewrite_func': None
				}
				
				self._log("Launching {0} clients...".format(len(cores)))
				
				client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
								"{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
				client_out_handler = FileOutputHandler(os.path.join(self.result_dir, "clients.out"))
				client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
									stdout_handler = client_out_handler, stderr_handler = client_out_handler)
				
				client_request.run()
				
				if not client_request.ok():
					# Some client failed, please panic
					self._log("One or more client process failed. Enjoy reading their outputs.")
					self._log("OUTPUT STARTS -------------------------------------------------\n")
					for process in client_request.processes():
						print("----- {0} returned {1}".format(process.host().address, process.exit_code()))
						if not process.stdout() == "": print(GREEN + process.stdout() + NORMAL)
						if not process.stderr() == "": print(RED + process.stderr() + NORMAL)
						print("")
					self._log("OUTPUT ENDS ---------------------------------------------------\n")
					sweeper.skip(comb)
					launch_server.kill()
					launch_server.wait()
				else:
					# Waiting for server to end
					launch_server.wait()
				
					# Getting log files
					distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions'])
					local_path = distant_path
					
					EX.Get([server], distant_path).run()
					
					EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run()
					
					EX.Get([server], 'client_*.out', local_location = self.result_dir)
					EX.Remote('rm -f client_*.out', [server])
					
					self._log("Finishing experiment with {0} clients and {1} transitions per client".format(comb['n_clients'], comb['n_transitions']))
					
					sweeper.done(comb)
					
				sub_comb = sweeper.get_next (filtr = lambda r: filter(lambda s: s["n_clients"] == comb['n_clients'], r))
				self._updateStat(sweeper.stats())
				
				if not sub_comb: 
					# Killing job
					EX5.oar.oardel(job)
					self.__class__._job = None
					break
				else: 
					comb = sub_comb
		
		print ""
Пример #15
0
def get_vms_slot(vms, elements, slots, excluded_elements=None):
    """Return a slot with enough RAM and CPU """
    chosen_slot = None
    mem = vms[0]['mem']
    cpu = vms[0]['n_cpu']
    req_ram = sum([vm['mem'] for vm in vms])
    req_cpu = sum([vm['n_cpu'] for vm in vms]) / 3
    logger.debug('RAM %s CPU %s', req_ram, req_cpu)

    for element in excluded_elements:
        if element in get_g5k_sites():
            excluded_elements += [
                cluster for cluster in get_site_clusters(element)
                if cluster not in excluded_elements
            ]

    if 'grid5000' in elements:
        clusters = [
            cluster for cluster in get_g5k_clusters()
            if cluster not in excluded_elements
            and get_cluster_site not in excluded_elements
        ]
    else:
        clusters = [
            element for element in elements if element in get_g5k_clusters()
            and element not in excluded_elements
        ]
        for element in elements:
            if element in get_g5k_sites():
                clusters += [
                    cluster for cluster in get_site_clusters(element)
                    if cluster not in clusters
                    and cluster not in excluded_elements
                ]

    for slot in slots:
        hosts = []
        for element in slot[2]:
            if str(element) in clusters:
                n_hosts = slot[2][element]
                for i in range(n_hosts):
                    hosts.append(Host(str(element + '-1.' + \
                            get_cluster_site(element) + '.grid5000.fr')))
        attr = get_CPU_RAM_FLOPS(hosts)['TOTAL']

        if attr['CPU'] > req_cpu and attr['RAM'] > req_ram:
            chosen_slot = slot
            break
        del hosts[:]

    if chosen_slot is None:
        return None, None

    resources_needed = {}
    resources_available = chosen_slot[2]
    logger.debug('resources available' + pformat(resources_available))
    iter_clusters = cycle(clusters)
    while req_ram > 0 or req_cpu > 0:
        cluster = iter_clusters.next()
        if resources_available[cluster] == 0:
            clusters.remove(cluster)
            iter_clusters = cycle(clusters)
        else:
            host = cluster + '-1'
            attr = get_CPU_RAM_FLOPS([host])
            resources_available[cluster] -= 1
            req_ram -= float(attr[host]['RAM'] / mem) * mem
            req_cpu -= float(attr[host]['CPU'] / cpu) * cpu

            if cluster not in resources_needed:
                resources_needed[cluster] = 0
            resources_needed[cluster] += 1

    if 'kavlan' in elements:
        resources_needed['kavlan'] = 1

    logger.debug('resources needed' + pformat(resources_needed))
    return chosen_slot[0], distribute_hosts(chosen_slot[2], resources_needed,
                                            excluded_elements)
Пример #16
0
def get_jobs_specs(resources, excluded_elements=None, name=None):
    """ Generate the several job specifications from the dict of resources and
    the blacklisted elements

    :param resources: a dict, whose keys are Grid'5000 element and values the
      corresponding number of n_nodes

    :param excluded_elements: a list of elements that won't be used

    :param name: the name of the jobs that will be given
    """
    jobs_specs = []
    if excluded_elements == None:
        excluded_elements = []

    # Creating the list of sites used
    sites = []
    real_resources = resources.copy()
    for resource in resources:
        if resource in get_g5k_sites() and resource not in sites:
            sites.append(resource)
        if resource in get_g5k_clusters(queues=None):
            if resource not in excluded_elements:
                site = get_cluster_site(resource)
                if site not in sites:
                    sites.append(site)
                if site not in real_resources:
                    real_resources[site] = 0

    # Checking if we need a Kavlan, a KaVLAN global or none
    get_kavlan = 'kavlan' in resources
    if get_kavlan:
        kavlan = 'kavlan'
        n_sites = 0
        for resource in real_resources:
            if resource in sites:
                n_sites += 1
            if n_sites > 1:
                kavlan += '-global'
                break

    blacklisted_hosts = {}
    for element in excluded_elements:
        if element not in get_g5k_clusters(queues=None) + get_g5k_sites():
            site = get_host_site(element)
            if not 'site' in blacklisted_hosts:
                blacklisted_hosts[site] = [element]
            else:
                blacklisted_hosts[site].append(element)

    for site in sites:
        sub_resources = ''
        # Adding a KaVLAN if needed
        if get_kavlan:
            if not 'global' in kavlan:
                sub_resources = "{type='" + kavlan + "'}/vlan=1+"
                get_kavlan = False
            elif site in resources['kavlan']:
                sub_resources = "{type='" + kavlan + "'}/vlan=1+"
                get_kavlan = False

        base_sql = '{'
        end_sql = '}/'

        # Creating blacklist SQL string for hosts
        host_blacklist = False
        str_hosts = ''
        if site in blacklisted_hosts and len(blacklisted_hosts[site]) > 0:
            str_hosts = ''.join([
                "host not in ('" + get_host_longname(host) + "') and "
                for host in blacklisted_hosts[site]
            ])
            host_blacklist = True

        #Adding the clusters blacklist
        str_clusters = str_hosts if host_blacklist else ''
        cl_blacklist = False
        clusters_nodes = 0
        for cluster in get_site_clusters(site, queues=None):
            if cluster in resources and resources[cluster] > 0:
                if str_hosts == '':
                    sub_resources += "{cluster='" + cluster + "'}"
                else:
                    sub_resources += base_sql + str_hosts + "cluster='" + \
                        cluster + "'" + end_sql
                sub_resources += "/nodes=" + str(resources[cluster]) + '+'
                clusters_nodes += resources[cluster]
            if cluster in excluded_elements:
                str_clusters += "cluster not in ('" + cluster + "') and "
                cl_blacklist = True

        # Generating the site blacklist string from host and cluster blacklist
        str_site = ''
        if host_blacklist or cl_blacklist:
            str_site += base_sql
            if not cl_blacklist:
                str_site += str_hosts[:-4]
            else:
                str_site += str_clusters[:-4]
            str_site = str_site + end_sql

        if real_resources[site] > 0:
            sub_resources += str_site + "nodes=" + str(real_resources[site]) +\
                '+'

        if sub_resources != '':
            jobs_specs.append((OarSubmission(resources=sub_resources[:-1],
                                             name=name), site))

    return jobs_specs
Пример #17
0
nodefile = "./gridnodes-uniq"
try:
    os.remove(nodes_gr1); os.remove(nodes_service); os.remove(nodes_gr2); os.remove(nodes_gr3);
except OSError:
    pass

cluster = 'orion'

sites = []
hosts_gr1 = {'cluster' : 'orion', 'nodes' : n_nodes, 'cores' : 12}#n_nodes}
hosts_gr2 = {'cluster' : 'sagittaire', 'nodes' : n_nodes, 'cores' : 12}
hosts_gr3 = {'cluster' : 'taurus', 'nodes' : n_nodes, 'cores' : 2}

hosts_service = {'cluster' : 'sagittaire', 'nodes' : 2} # MA + Client

site = get_cluster_site(hosts_service["cluster"])
user_frontend_connexion_params={'user': '******', 'default_frontend': "lyon", 'ssh_options': ('-tt', '-o', 'BatchMode=yes', '-o', 'PasswordAuthentication=no', '-o', 'StrictHostKeyChecking=no', '-o', 'UserKnownHostsFile=/dev/null', '-o', 'ConnectTimeout=45')}
root_connexion_params={'user': '******', 'ssh_options': ('-tt', '-o', 'BatchMode=yes', '-o', 'PasswordAuthentication=no', '-o', 'StrictHostKeyChecking=no', '-o', 'UserKnownHostsFile=/dev/null', '-o', 'ConnectTimeout=45')}

logger.info("Job Submission...")
subs = []
sub_resources=''
for hosts in [hosts_gr1,hosts_service,hosts_gr2,hosts_gr3]:
    sub_resources += "{cluster=\\'"+hosts["cluster"]+"\\'}/nodes="+str(hosts["nodes"])+'+'
subs.append((OarSubmission(resources=sub_resources[:-1],walltime = walltime,additional_options = oargridsub_opts),site)) #

total_cores = 0
for hosts in [hosts_gr1,hosts_gr2,hosts_gr3]:
    total_cores += ( hosts["nodes"]*hosts["cores"] )
print total_cores
 
Пример #18
0
def distribute_hosts(resources_available,
                     resources_wanted,
                     excluded_elements=None,
                     ratio=None):
    """ Distribute the resources on the different sites and cluster

    :param resources_available: a dict defining the resources available

    :param resources_wanted: a dict defining the resources available you really want

    :param excluded_elements: a list of elements that won't be used

    :param ratio: if not None (the default), a float between 0 and 1,
      to actually only use a fraction of the resources."""
    if excluded_elements == None: excluded_elements = []
    resources = {}
    #Defining the cluster you want
    clusters_wanted = {}
    for element, n_nodes in resources_wanted.items():
        if element in get_g5k_clusters(queues=None):
            clusters_wanted[element] = n_nodes
    for cluster, n_nodes in clusters_wanted.items():
        nodes = n_nodes if n_nodes > 0 else resources_available[cluster]
        resources_available[get_cluster_site(cluster)] -= nodes
        resources[cluster] = nodes

    # Blacklisting clusters
    for element in excluded_elements:
        if element in get_g5k_clusters(
                queues=None) and element in resources_available:
            resources_available['grid5000'] -= resources_available[element]
            resources_available[get_cluster_site(
                element)] -= resources_available[element]
            resources_available[element] = 0

    #Defining the sites you want
    sites_wanted = {}
    for element, n_nodes in resources_wanted.items():
        if element in get_g5k_sites() and element not in excluded_elements:
            sites_wanted[element] = n_nodes
    for site, n_nodes in sites_wanted.items():
        resources[site] = n_nodes if n_nodes > 0 else resources_available[site]

    # Blacklisting sites
    for element in excluded_elements:
        if element in get_g5k_sites() and element in resources_available:
            resources_available['grid5000'] -= resources_available[element]
            resources_available[element] = 0

    #Distributing hosts on grid5000 elements
    logger.debug(pformat(resources_wanted))
    if 'grid5000' in resources_wanted:
        g5k_nodes = resources_wanted['grid5000'] if resources_wanted[
            'grid5000'] > 0 else resources_available['grid5000']

        total_nodes = 0

        sites = [
            element for element in resources_available
            if element in get_g5k_sites()
        ]
        iter_sites = cycle(sites)

        while total_nodes < g5k_nodes:
            site = next(iter_sites)
            if resources_available[site] == 0:
                sites.remove(site)
                iter_sites = cycle(sites)
            else:
                resources_available[site] -= 1
                if site in resources:
                    resources[site] += 1
                else:
                    resources[site] = 1
                total_nodes += 1
    logger.debug(pformat(resources))

    if 'kavlan' in resources_wanted:
        resources['kavlan'] = resources_available['kavlan']

    # apply optional ratio
    if ratio != None:
        resources.update(
            (x, int(floor(y * ratio))) for x, y in resources.items())

    return resources
Пример #19
0
    def run(self):
        """Inherited method, put here the code for running the engine."""

        # Get parameters
        self.cluster = self.args[0]
        self.n_nodes = int(self.args[1])
        self.config_file = self.args[2]
        self.site = get_cluster_site(self.cluster)

        if not os.path.exists(self.config_file):
            logger.error("Params file " + self.config_file + " does not exist")
            sys.exit(1)

        # Set oar job id
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        # Main
        try:
            # Creation of the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While they are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:

                ## SETUP
                # If no job, we make a reservation and prepare the hosts for the
                # experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    success = self.setup()
                    if not success:
                        break
                else:
                    self.hosts = get_oar_job_nodes(self.oar_job_id,
                                                   self.frontend)
                ## SETUP FINISHED

                logger.info("Setup finished in hosts " + str(self.hosts))

                test_threads = []
                for h in self.hosts:
                    t = TestThread(h, self.comb_manager, self.stats_manager)
                    test_threads.append(t)
                    t.name = "th_" + str(h.address).split(".")[0]
                    t.start()

                for t in test_threads:
                    t.join()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    pass
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

            # Close stats
            self.stats_manager.close()
Пример #20
0
    def run(self):
        # Defining experiment parameters
        self.parameters = {
            'n_clients': [400, 450, 500, 550, 600],
            'n_transitions': [10000]
        }
        cluster = 'griffon'
        sweeps = sweep(self.parameters)
        sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        server_out_path = os.path.join(self.result_dir, "server.out")

        self._updateStat(sweeper.stats())

        # Loop on the number of nodes
        while True:
            # Taking the next parameter combinations
            comb = sweeper.get_next()
            if not comb: break

            # Performing the submission on G5K
            site = get_cluster_site(cluster)
            self._log("Output will go to " + self.result_dir)

            n_nodes = int(
                math.ceil(
                    float(comb['n_clients']) / EX5.get_host_attributes(
                        cluster + '-1')['architecture']['smt_size'])) + 1
            self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

            resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
            submission = EX5.OarSubmission(resources=resources,
                                           job_type='allow_classic_ssh',
                                           walltime='00:10:00')

            job = EX5.oarsub([(submission, site)])
            self.__class__._job = job

            # Sometimes oarsub fails silently
            if job[0][0] is None:
                print("\nError: no job was created")
                sys.exit(1)

            # Wait for the job to start
            self._log(
                "Waiting for job {0} to start...\n".format(BOLD_MAGENTA +
                                                           str(job[0][0]) +
                                                           NORMAL))
            EX5.wait_oar_job_start(job[0][0],
                                   job[0][1],
                                   prediction_callback=prediction)
            nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

            # Deploying nodes
            #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
            #run_deploy = EX5.deploy(deployment)
            #nodes_deployed = run_deploy.hosts[0]

            # Copying active_data program on all deployed hosts
            EX.Put([nodes[0]],
                   '../dist/active-data-lib-0.1.2.jar',
                   connexion_params={
                       'user': '******'
                   }).run()
            EX.Put([nodes[0]],
                   '../server.policy',
                   connexion_params={
                       'user': '******'
                   }).run()

            # Loop on the number of requests per client process
            while True:
                # Split the nodes
                clients = nodes[1:]
                server = nodes[0]

                self._log(
                    "Running experiment with {0} nodes and {1} transitions per client"
                    .format(len(clients), comb['n_transitions']))

                # Launching Server on one node
                out_handler = FileOutputHandler(server_out_path)
                launch_server = EX.Remote(
                    'java -jar active-data-lib-0.1.2.jar', [server],
                    stdout_handler=out_handler,
                    stderr_handler=out_handler).start()
                self._log("Server started on " + server.address)
                time.sleep(2)

                # Launching clients
                rank = 0
                n_cores = EX5.get_host_attributes(
                    clients[0])['architecture']['smt_size']
                cores = nodes * n_cores
                cores = cores[
                    0:comb['n_clients']]  # Cut out the additional cores

                client_connection_params = {
                    'taktuk_gateway': 'lyon.grid5000.fr',
                    'host_rewrite_func': None
                }

                self._log("Launching {0} clients...".format(len(cores)))

                client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                    "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
                client_out_handler = FileOutputHandler(
                    os.path.join(self.result_dir, "clients.out"))
                client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
                     stdout_handler = client_out_handler, stderr_handler = client_out_handler)

                client_request.run()

                if not client_request.ok():
                    # Some client failed, please panic
                    self._log(
                        "One or more client process failed. Enjoy reading their outputs."
                    )
                    self._log(
                        "OUTPUT STARTS -------------------------------------------------\n"
                    )
                    for process in client_request.processes():
                        print("----- {0} returned {1}".format(
                            process.host().address, process.exit_code()))
                        if not process.stdout() == "":
                            print(GREEN + process.stdout() + NORMAL)
                        if not process.stderr() == "":
                            print(RED + process.stderr() + NORMAL)
                        print("")
                    self._log(
                        "OUTPUT ENDS ---------------------------------------------------\n"
                    )
                    sweeper.skip(comb)
                    launch_server.kill()
                    launch_server.wait()
                else:
                    # Waiting for server to end
                    launch_server.wait()

                    # Getting log files
                    distant_path = OUT_FILE_FORMAT.format(
                        len(cores), comb['n_transitions'])
                    local_path = distant_path

                    EX.Get([server], distant_path).run()

                    EX.Local('mv ' + distant_path + ' ' +
                             os.path.join(self.result_dir, local_path)).run()

                    EX.Get([server],
                           'client_*.out',
                           local_location=self.result_dir)
                    EX.Remote('rm -f client_*.out', [server])

                    self._log(
                        "Finishing experiment with {0} clients and {1} transitions per client"
                        .format(comb['n_clients'], comb['n_transitions']))

                    sweeper.done(comb)

                sub_comb = sweeper.get_next(filtr=lambda r: filter(
                    lambda s: s["n_clients"] == comb['n_clients'], r))
                self._updateStat(sweeper.stats())

                if not sub_comb:
                    # Killing job
                    EX5.oar.oardel(job)
                    self.__class__._job = None
                    break
                else:
                    comb = sub_comb

        print ""
Пример #21
0
def get_planning(elements=['grid5000'],
                 vlan=False,
                 subnet=False,
                 storage=False,
                 out_of_chart=False,
                 starttime=None,
                 endtime=None,
                 ignore_besteffort=True,
                 queues='default'):
    """Retrieve the planning of the elements (site, cluster) and others resources.
    Element planning structure is ``{'busy': [(123456,123457), ... ], 'free': [(123457,123460), ... ]}.``

    :param elements: a list of Grid'5000 elements ('grid5000', <site>, <cluster>)

    :param vlan: a boolean to ask for KaVLAN computation

    :param subnet: a boolean to ask for subnets computation

    :param storage: a boolean to ask for sorage computation

    :param out_of_chart: if True, consider that days outside weekends are busy

    :param starttime: start of time period for which to compute the planning, defaults to now + 1 minute

    :param endtime: end of time period for which to compute the planning, defaults to 4 weeks from now

    :param ignore_besteffort: True by default, to consider the resources with besteffort jobs as available

    :param queues: list of oar queues for which to get the planning

    Return a dict whose keys are sites, whose values are dict whose keys
    are cluster, subnets, kavlan or storage,
    whose values are planning dicts, whose keys are hosts, subnet address range,
    vlan number or chunk id planning respectively.
    """
    if not starttime:
        starttime = int(time() + timedelta_to_seconds(timedelta(minutes=1)))
    starttime = int(get_unixts(starttime))
    if not endtime:
        endtime = int(starttime +
                      timedelta_to_seconds(timedelta(weeks=4, minutes=1)))
    endtime = int(get_unixts(endtime))
    if 'grid5000' in elements:
        sites = elements = get_g5k_sites()
    else:
        sites = list(
            set([site for site in elements if site in get_g5k_sites()] + [
                get_cluster_site(cluster) for cluster in elements
                if cluster in get_g5k_clusters(queues=queues)
            ] + [
                get_host_site(host)
                for host in elements if host in get_g5k_hosts()
                or get_host_shortname(host) in get_g5k_hosts()
            ]))
    if len(sites) == 0:
        logger.error('Wrong elements given: %s' % (elements, ))
        return None
    planning = {}
    for site in sites:
        planning[site] = {}
        for cluster in get_site_clusters(site, queues=queues):
            planning[site][cluster] = {}

    for site in sites:
        if vlan:
            planning[site].update({'vlans': {}})
        if subnet:
            planning[site].update({'subnets': {}})
        if storage:
            planning[site].update({'storage': {}})

    if _retrieve_method == 'API':
        _get_planning_API(planning, ignore_besteffort)
    elif _retrieve_method == 'PostgreSQL':
        _get_planning_PGSQL(planning, ignore_besteffort)

    if out_of_chart:
        _add_charter_to_planning(planning, starttime, endtime)

    for site_pl in planning.values():
        for res_pl in site_pl.values():
            for el_planning in res_pl.values():
                el_planning['busy'].sort()
                _merge_el_planning(el_planning['busy'])
                _trunc_el_planning(el_planning['busy'], starttime, endtime)
                _fill_el_planning_free(el_planning, starttime, endtime)

    # cleaning
    real_planning = deepcopy(planning)
    for site, site_pl in planning.items():
        for cl, cl_pl in site_pl.items():
            if cl in ['vlans']:
                continue
            keep_cluster = False
            for h in cl_pl:
                if not (get_host_site(h) in elements
                        or get_host_cluster(h) in elements
                        or get_host_shortname(h) in elements or h in elements):
                    del real_planning[site][cl][h]
                else:
                    keep_cluster = True
            if not keep_cluster:
                del real_planning[site][cl]

    return real_planning