def _create_master_and_slave_conf(self, conf_dir):
    """Set spark.master in spark-defaults.conf and write the slaves file.

    If the user's defaults already run Spark locally, the setting is kept
    and only a warning is emitted; otherwise the master URL is derived
    from the deployment mode (standalone or YARN).
    """
    defs_file = conf_dir + "/spark-defaults.conf"

    configured_master = read_param_in_props_file(defs_file, "spark.master")

    if configured_master and configured_master.startswith("local"):
        # Respect an explicit local master, but tell the user about it.
        logger.warn("Your default configuration executes Spark locally. "
                    "Note that unless otherwise specified when launching "
                    "your scripts, the distributed configuration will be "
                    "ignored.")
    else:
        if self.mode == STANDALONE_MODE:
            # Always override?
            configured_master = "spark://%s:%d" % (self.master.address,
                                                   self.port)
        elif self.mode == YARN_MODE:
            if not configured_master:
                configured_master = "yarn-client"
            elif configured_master not in ("yarn-client", "yarn-cluster"):
                logger.warn("Provided spark.master is not compatible "
                            "with YARN mode. Overriding with "
                            "'yarn-client'")
                configured_master = "yarn-client"

        write_in_props_file(defs_file, "spark.master", configured_master,
                            create_if_absent=True, override=True)

    # One worker host per line in the slaves file.
    with open(conf_dir + "/slaves", "w") as slaves_file:
        for host in self.hosts:
            slaves_file.write(host.address + "\n")
def change_conf(self, params, conf_file=None, default_file=SPARK_CONF_FILE):
    """Modify Spark configuration.

    This method copies the configuration files from the first host of
    each g5k cluster conf dir into a local temporary dir, does all the
    changes in place and broadcasts the new configuration files to all
    hosts.

    Args:
      params (dict of str:str):
        The parameters to be changed in the form key:value.
      conf_file (str, optional):
        The file where parameters should be set. If not specified, all
        files are checked for the parameter name and the parameter is set
        in the file where the property is found. If not found, the
        parameter is set in the default file.
      default_file (str, optional):
        The default conf file where to set the parameter if not found.
        Only applies when conf_file is not set.
    """
    # Local imports keep the function self-contained; both are stdlib.
    import shutil
    import tempfile

    for cluster in self.hw.get_clusters():
        hosts = cluster.get_hosts()

        # List the .conf files present on the first host of the cluster.
        action = Remote("ls " + self.conf_dir + "/*.conf", [hosts[0]])
        action.run()
        output = action.processes[0].stdout

        remote_conf_files = [os.path.join(self.conf_dir, f)
                             for f in output.split()]

        # Use a fresh private temp dir instead of a fixed, predictable
        # /tmp path: avoids collisions between concurrent runs/users and
        # stale files from previous invocations.
        tmp_dir = tempfile.mkdtemp(prefix="spark_conf_")
        try:
            action = Get([hosts[0]], remote_conf_files, tmp_dir)
            action.run()

            # Do replacements in the local copies.
            if conf_file:
                f = os.path.join(tmp_dir, conf_file)
                for name, value in params.items():
                    write_in_props_file(f, name, value, True)
            else:
                temp_conf_files = [os.path.join(tmp_dir, f)
                                   for f in os.listdir(tmp_dir)]

                for name, value in params.items():
                    for f in temp_conf_files:
                        if write_in_props_file(f, name, value):
                            break
                    else:
                        # Property not found in any file - add it in the
                        # default conf file.
                        logger.info("Parameter with name " + name + " has not "
                                    "been found in any conf file. Setting it "
                                    "in " + default_file)
                        f = os.path.join(tmp_dir, default_file)
                        write_in_props_file(f, name, value, True)

            # Copy back the files to all hosts.
            self._copy_conf(tmp_dir, hosts)
        finally:
            # Don't leak a temp dir per call (the original fixed dir was
            # never removed).
            shutil.rmtree(tmp_dir, ignore_errors=True)
def _set_common_params(self, params, conf_dir, default_tuning=False):
    """Replace common parameters. Some user-specified values are
    overwritten.

    Args:
      params (str):
        Already defined parameters over all the clusters.
      conf_dir (str):
        The path of the directory with the configuration files.
      default_tuning (bool, optional):
        Whether to use automatic tuning based on some best practices or
        leave the default parameters.
    """
    defs_file = conf_dir + "/spark-defaults.conf"

    # Append the master port to spark-env.sh on every host.
    env_command = ("cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
                   "SPARK_MASTER_PORT=" + str(self.port) + "\n"
                   "EOF\n")
    Remote(env_command, self.hosts).run()

    # Parameters already computed over all the clusters.
    global_params = params["global"]
    exec_mem = global_params["exec_mem"]
    exec_cores = global_params["exec_cores"]
    total_execs = global_params["total_execs"]

    # Event-log parameters (forced, overriding any user value).
    if self.evs_log_dir:
        for prop, value in (("spark.eventLog.enabled", "true"),
                            ("spark.eventLog.dir", self.evs_log_dir)):
            write_in_props_file(defs_file, prop, value,
                                create_if_absent=True, override=True)

    # Only set if the user did not provide a value.
    write_in_props_file(defs_file, "spark.logConf", "true",
                        create_if_absent=True, override=False)

    if default_tuning:
        # Best-practice executor sizing; user-provided values win.
        for prop, value in (("spark.executor.memory", "%dm" % exec_mem),
                            ("spark.executor.cores", exec_cores),
                            ("spark.executor.instances", total_execs)):
            write_in_props_file(defs_file, prop, value,
                                create_if_absent=True, override=False)