def change_conf(self, params, conf_file=None, default_file=MR_CONF_FILE):
    """Modify Hadoop configuration.

    This method copies the configuration files from the first host of
    each g5k cluster conf dir into a local temporary dir, does all the
    changes in place and broadcasts the new configuration files to all
    hosts.

    Args:
      params (dict of str:str):
        The parameters to be changed in the form key:value.
      conf_file (str, optional):
        The file where parameters should be set. If not specified, all
        files are checked for the parameter name and the parameter is
        set in the file where the property is found. If not found, the
        parameter is set in the default file.
      default_file (str, optional):
        The default conf file where to set the parameter if not found.
        Only applies when conf_file is not set.
    """

    for cluster in self.hw.get_clusters():
        hosts = cluster.get_hosts()

        # List the conf files on the first host of the cluster.
        action = Remote("ls " + self.conf_dir + "/*.xml", [hosts[0]])
        action.run()
        output = action.processes[0].stdout

        # NOTE(review): "ls" on a glob prints absolute paths here, and
        # os.path.join with an absolute second argument keeps it as-is,
        # so the join is a harmless no-op in that case.
        remote_conf_files = []
        for f in output.split():
            remote_conf_files.append(os.path.join(self.conf_dir, f))

        tmp_dir = "/tmp/mliroz_temp_hadoop/"
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        # Fetch the remote conf files into the local temporary dir.
        action = Get([hosts[0]], remote_conf_files, tmp_dir)
        action.run()

        # Do replacements in the local temporary copies.
        # items() instead of the Python-2-only iteritems() keeps the
        # method working on both Python 2 and Python 3.
        if conf_file:
            f = os.path.join(tmp_dir, conf_file)
            for name, value in params.items():
                replace_in_xml_file(f, name, value, True)
        else:
            temp_conf_files = [os.path.join(tmp_dir, f)
                               for f in os.listdir(tmp_dir)]

            for name, value in params.items():
                for f in temp_conf_files:
                    if replace_in_xml_file(f, name, value):
                        break
                else:
                    # Property not found in any file - add it in the
                    # default conf file (default_file parameter).
                    logger.info("Parameter with name " + name + " has not "
                                "been found in any conf file. Setting it "
                                "in " + default_file)
                    f = os.path.join(tmp_dir, default_file)
                    replace_in_xml_file(f, name, value, True)

        # Copy back the files to all hosts
        self._copy_conf(tmp_dir, hosts)
def _configure_servers(self, hosts=None):
    """Configure servers and host-dependant parameters.

    Sets the HDFS and JobTracker endpoints of the underlying Hadoop
    cluster, the Hive warehouse directory and the Derby metastore
    connection URL in the temporary hive-site.xml.

    Args:
      hosts (list of Host, optional):
        The list of hosts to take into account in the configuration. If
        not specified, all the hosts of the cluster are used. The first
        host of this list is always used as the reference.
    """

    if not hosts:
        hosts = self.hosts

    conf_file = os.path.join(self.temp_conf_dir, "hive-site.xml")

    hc_master = self.hc.master.address

    # (property, value) pairs written into hive-site.xml; every
    # property is created if absent (last positional arg True).
    hive_props = [
        ("fs.default.name",
         "hdfs://" + hc_master + ":" + str(self.hc.hdfs_port) + "/"),
        ("mapred.job.tracker",
         hc_master + ":" + str(self.hc.mapred_port)),
        ("hive.metastore.warehouse.dir", self.warehouse_dir),
        ("javax.jdo.option.ConnectionURL",
         "jdbc:derby:;databaseName=" + self.metastore_dir +
         ";create=true"),
    ]
    for prop_name, prop_value in hive_props:
        replace_in_xml_file(conf_file, prop_name, prop_value, True)
def _set_common_params(self, params, conf_dir, default_tuning=False):
    """Replace common parameters. Some user-specified values are
    overwritten.

    Args:
      params (str):
        Already defined parameters over all the clusters.
      conf_dir (str):
        The path of the directory with the configuration files.
      default_tuning (bool, optional):
        Whether to use automatic tuning based on some best practices or
        leave the default parameters.
    """

    core_file = os.path.join(conf_dir, CORE_CONF_FILE)
    mr_file = os.path.join(conf_dir, MR_CONF_FILE)

    # Core-site properties; each is created when absent.
    core_props = [
        ("fs.default.name",
         "hdfs://%s:%d/" % (self.master.address, self.hdfs_port)),
        ("hadoop.tmp.dir", self.hadoop_temp_dir),
        ("topology.script.file.name", self.conf_dir + "/topo.sh"),
    ]
    for prop_name, prop_value in core_props:
        replace_in_xml_file(core_file, prop_name, prop_value,
                            create_if_absent=True)

    # JobTracker endpoint goes into the mapred conf file.
    replace_in_xml_file(mr_file, "mapred.job.tracker",
                        "%s:%d" % (self.master.address, self.mapred_port),
                        create_if_absent=True)
def _set_cluster_params(self, cluster, params, conf_dir,
                        default_tuning=False):
    """Replace cluster-dependent parameters.

    Args:
      cluster (PhysicalCluster):
        The PhysicalCluster object to take into account in the
        configuration.
      params (str):
        Already defined parameters over all the clusters.
      conf_dir (str):
        The path of the directory with the configuration files.
      default_tuning (bool, optional):
        Whether to use automatic tuning based on some best practices or
        leave the default parameters.
    """

    yarn_file = os.path.join(conf_dir, YARN_CONF_FILE)
    cname = cluster.get_name()

    # NodeManager resource limits for this physical cluster. User-set
    # values are only overwritten when default tuning is enabled.
    resource_limits = [
        ("yarn.nodemanager.resource.memory-mb",
         params[cname]["max_cont_mem"]),
        ("yarn.nodemanager.resource.cpu-vcores",
         params[cname]["max_cont_cores"]),
    ]
    for prop_name, prop_value in resource_limits:
        replace_in_xml_file(yarn_file, prop_name, str(prop_value),
                            create_if_absent=True,
                            replace_if_present=default_tuning)
def _set_cluster_params(self, cluster, params, conf_dir,
                        default_tuning=False):
    """Replace cluster-dependent parameters.

    Args:
      cluster (PhysicalCluster):
        The PhysicalCluster object to take into account in the
        configuration.
      params (str):
        Already defined parameters over all the clusters.
      conf_dir (str):
        The path of the directory with the configuration files.
      default_tuning (bool, optional):
        Whether to use automatic tuning based on some best practices or
        leave the default parameters.
    """

    cname = cluster.get_name()
    mem_per_slot_mb = params[cname]["mem_per_slot_mb"]
    map_slots = params[cname]["map_slots"]
    red_slots = params[cname]["red_slots"]

    # Without default tuning there is nothing to change for MRv1 slots.
    if not default_tuning:
        return

    mr_file = os.path.join(conf_dir, MR_CONF_FILE)

    # Slot counts and per-slot heap. User-provided values are never
    # clobbered (replace_if_present=False).
    slot_props = [
        ("mapred.tasktracker.map.tasks.maximum", str(map_slots)),
        ("mapred.tasktracker.reduce.tasks.maximum", str(red_slots)),
        ("mapred.child.java.opts", "-Xmx%dm" % mem_per_slot_mb),
    ]
    for prop_name, prop_value in slot_props:
        replace_in_xml_file(mr_file, prop_name, prop_value,
                            create_if_absent=True,
                            replace_if_present=False)
def _set_common_params(self, params, conf_dir, default_tuning=False):
    """Replace common parameters. Some user-specified values are
    overwritten.

    Args:
      params (str):
        Already defined parameters over all the clusters.
      conf_dir (str):
        The path of the directory with the configuration files.
      default_tuning (bool, optional):
        Whether to use automatic tuning based on some best practices or
        leave the default parameters.
    """

    core_file = os.path.join(conf_dir, CORE_CONF_FILE)
    yarn_file = os.path.join(conf_dir, YARN_CONF_FILE)
    mr_file = os.path.join(conf_dir, MR_CONF_FILE)

    global_params = params["global"]
    sch_max_mem = global_params["sch_max_mem"]
    sch_max_cores = global_params["sch_max_cores"]

    # General and HDFS
    replace_in_xml_file(core_file, "fs.defaultFS",
                        "hdfs://%s:%d/" % (self.master.address,
                                           self.hdfs_port),
                        create_if_absent=True,
                        replace_if_present=True)
    replace_in_xml_file(core_file, "hadoop.tmp.dir",
                        self.hadoop_temp_dir,
                        create_if_absent=True,
                        replace_if_present=True)
    replace_in_xml_file(core_file, "topology.script.file.name",
                        self.conf_dir + "/topo.sh",
                        create_if_absent=True,
                        replace_if_present=True)

    # YARN
    replace_in_xml_file(yarn_file, "yarn.resourcemanager.hostname",
                        self.master.address,
                        create_if_absent=True)
    replace_in_xml_file(mr_file, "mapreduce.framework.name", "yarn",
                        create_if_absent=True,
                        replace_if_present=True)
    replace_in_xml_file(yarn_file, "yarn.nodemanager.aux-services",
                        "mapreduce_shuffle",
                        create_if_absent=True,
                        replace_if_present=True)
    # Scheduler maximums: user values only overwritten when default
    # tuning is enabled.
    replace_in_xml_file(yarn_file, "yarn.scheduler.maximum-allocation-mb",
                        str(sch_max_mem),
                        create_if_absent=True,
                        replace_if_present=default_tuning)
    replace_in_xml_file(yarn_file,
                        "yarn.scheduler.maximum-allocation-vcores",
                        str(sch_max_cores),
                        create_if_absent=True,
                        replace_if_present=default_tuning)

    if default_tuning:
        # YARN
        min_cont_mem = global_params["min_cont_mem"]
        replace_in_xml_file(yarn_file,
                            "yarn.scheduler.minimum-allocation-mb",
                            str(min_cont_mem),
                            create_if_absent=True,
                            replace_if_present=True)

        # MR memory settings
        map_mem = global_params["map_mem"]
        red_mem = global_params["red_mem"]
        map_java_heap = global_params["map_java_heap"]
        red_java_heap = global_params["red_java_heap"]
        replace_in_xml_file(mr_file, "mapreduce.map.memory.mb",
                            str(map_mem),
                            create_if_absent=True,
                            replace_if_present=True)
        replace_in_xml_file(mr_file, "mapreduce.map.java.opts",
                            "-Xmx%dm" % map_java_heap,
                            create_if_absent=True,
                            replace_if_present=True)
        replace_in_xml_file(mr_file, "mapreduce.reduce.memory.mb",
                            str(red_mem),
                            create_if_absent=True,
                            replace_if_present=True)
        replace_in_xml_file(mr_file, "mapreduce.reduce.java.opts",
                            "-Xmx%dm" % red_java_heap,
                            create_if_absent=True,
                            replace_if_present=True)

        # MR core settings
        replace_in_xml_file(mr_file, "mapreduce.map.cpu.vcores", "1",
                            create_if_absent=True,
                            replace_if_present=True)
        # Fixed: was "mapreduce.map.reduce.vcores", which is not a
        # Hadoop property; reducer vcores are configured through
        # "mapreduce.reduce.cpu.vcores".
        replace_in_xml_file(mr_file, "mapreduce.reduce.cpu.vcores", "1",
                            create_if_absent=True,
                            replace_if_present=True)

        # MR shuffle
        io_sort_mb = global_params["io_sort_mb"]
        io_sort_factor = global_params["io_sort_factor"]
        replace_in_xml_file(mr_file, "mapreduce.map.output.compress",
                            "true",
                            create_if_absent=True,
                            replace_if_present=True)
        replace_in_xml_file(mr_file, "mapreduce.task.io.sort.mb",
                            str(io_sort_mb),
                            create_if_absent=True,
                            replace_if_present=True)
        replace_in_xml_file(mr_file, "mapreduce.task.io.sort.factor",
                            str(io_sort_factor),
                            create_if_absent=True,
                            replace_if_present=True)