    def _validate_existing_ng_scaling(self, cluster, existing):
        scalable_processes = self._get_scalable_processes()
        dn_to_delete = 0
        for ng in cluster.node_groups:
            if ng.id in existing:
                if ng.count > existing[ng.id] and ("datanode" in
                                                   ng.node_processes):
                    dn_to_delete += ng.count - existing[ng.id]
                if not set(ng.node_processes).issubset(scalable_processes):
                    raise ex.NodeGroupCannotBeScaled(
                        ng.name, _("Spark plugin cannot scale nodegroup"
                                   " with processes: %s") %
                        ' '.join(ng.node_processes))

        dn_amount = len(utils.get_instances(cluster, "datanode"))
        rep_factor = utils.get_config_value_or_default('HDFS',
                                                       "dfs.replication",
                                                       cluster)

        if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
            raise ex.ClusterCannotBeScaled(
                cluster.name, _("Spark plugin cannot shrink cluster because "
                                "there would be not enough nodes for HDFS "
                                "replicas (replication factor is %s)") %
                rep_factor)

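# Illustrative sketch (not part of the plugin): the shrink check above in
# isolation. Removing datanodes is refused whenever the survivors would be
# fewer than the HDFS replication factor; the names below are hypothetical.
def can_remove_datanodes(current_dn, dn_to_delete, rep_factor):
    """Return True if enough datanodes remain to hold HDFS replicas."""
    if dn_to_delete <= 0:
        return True
    return current_dn - dn_to_delete >= rep_factor


# Example: with 4 datanodes and dfs.replication=3, at most one can be removed.
assert can_remove_datanodes(4, 1, 3) is True
assert can_remove_datanodes(4, 2, 3) is False
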
    def validate_job_execution(self, cluster, job, data):
        if not self.edp_supported(cluster.hadoop_version):
            raise ex.PluginInvalidDataException(
                _('Spark {base} or higher required to run {type} jobs').format(
                    base=EdpEngine.edp_base_version, type=job.type))

        super(EdpEngine, self).validate_job_execution(cluster, job, data)

    def _validate_additional_ng_scaling(self, cluster, additional):
        scalable_processes = self._get_scalable_processes()

        for ng_id in additional:
            ng = utils.get_by_id(cluster.node_groups, ng_id)
            if not set(ng.node_processes).issubset(scalable_processes):
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, _("Spark plugin cannot scale nodegroup"
                               " with processes: %s") %
                    ' '.join(ng.node_processes))

    def validate(self, cluster):
        nn_count = sum(
            [ng.count for ng in utils.get_node_groups(cluster, "namenode")])
        if nn_count != 1:
            raise ex.InvalidComponentCountException("namenode", 1, nn_count)

        dn_count = sum(
            [ng.count for ng in utils.get_node_groups(cluster, "datanode")])
        if dn_count < 1:
            # Report the datanode count (not the namenode count) here.
            raise ex.InvalidComponentCountException("datanode",
                                                    _("1 or more"), dn_count)

        rep_factor = utils.get_config_value_or_default('HDFS',
                                                       "dfs.replication",
                                                       cluster)
        if dn_count < rep_factor:
            raise ex.InvalidComponentCountException(
                'datanode', _('%s or more') % rep_factor, dn_count,
                _('Number of %(dn)s instances should not be less '
                  'than %(replication)s') % {
                      'dn': 'datanode', 'replication': 'dfs.replication'})

        # validate Spark Master Node and Spark Slaves
        sm_count = sum(
            [ng.count for ng in utils.get_node_groups(cluster, "master")])
        if sm_count < 1:
            raise ex.RequiredServiceMissingException("Spark master")
        if sm_count >= 2:
            raise ex.InvalidComponentCountException("Spark master", "1",
                                                    sm_count)

        sl_count = sum(
            [ng.count for ng in utils.get_node_groups(cluster, "slave")])
        if sl_count < 1:
            raise ex.InvalidComponentCountException("Spark slave",
                                                    _("1 or more"), sl_count)

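# Illustrative sketch (plain dicts, not sahara objects): the smallest topology
# that satisfies validate() above, assuming the usual dfs.replication default
# of 3 -- exactly one namenode, at least as many datanodes as the replication
# factor, exactly one Spark master, and at least one slave.
minimal_node_groups = [
    {"name": "master-ng", "count": 1,
     "node_processes": ["namenode", "master"]},
    {"name": "worker-ng", "count": 3,
     "node_processes": ["datanode", "slave"]},
]
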
    def _push_configs_to_nodes(self, cluster, extra, new_instances):
        all_instances = utils.get_instances(cluster)
        utils.add_provisioning_step(cluster.id, _("Push configs to nodes"),
                                    len(all_instances))
        with context.PluginsThreadGroup() as tg:
            for instance in all_instances:
                extra = self._add_instance_ng_related_to_extra(
                    cluster, instance, extra)
                if instance in new_instances:
                    tg.spawn('spark-configure-%s' % instance.instance_name,
                             self._push_configs_to_new_node, cluster,
                             extra, instance)
                else:
                    tg.spawn('spark-reconfigure-%s' % instance.instance_name,
                             self._push_configs_to_existing_node, cluster,
                             extra, instance)

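# Illustrative sketch, not the sahara PluginsThreadGroup API: the same
# new-vs-existing fan-out expressed with the standard library, just to show
# the control flow. All names below are hypothetical.
from concurrent.futures import ThreadPoolExecutor


def push_configs(all_instances, new_instances, push_new, push_existing):
    # Pick a handler per instance, then run the pushes concurrently.
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = []
        for inst in all_instances:
            handler = push_new if inst in new_instances else push_existing
            futures.append(pool.submit(handler, inst))
        # Propagate any failure, similar to a thread group joining its children.
        for f in futures:
            f.result()
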
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with utils.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(utils.get_remote(nn), "dfsadmin")
        context.sleep(3)

        utils.plugin_option_poll(
            nn.cluster, _is_decommissioned, c_helper.DECOMMISSIONING_TIMEOUT,
            _("Decommission %s") % "DataNodes", 3,
            {'r': r, 'inst_to_be_deleted': inst_to_be_deleted})

        r.write_files_to({
            '/etc/hadoop/dn.incl':
                utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/dn.excl': ""})

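# Illustrative sketch of the decommission flow above: the doomed nodes' FQDNs
# go into the exclude file, the NameNode is told to refresh, the plugin polls
# until decommissioning finishes, and finally the include file is rewritten
# with the survivors while the exclude file is cleared. This assumes
# utils.generate_fqdn_host_names() yields one FQDN per line (an assumption);
# the hostnames and helper below are hypothetical.
def fqdn_lines(instances):
    return "\n".join(inst["fqdn"] for inst in instances)


to_delete = [{"fqdn": "worker-3.example.local"}]
survivors = [{"fqdn": "worker-1.example.local"},
             {"fqdn": "worker-2.example.local"}]

dn_excl = fqdn_lines(to_delete)   # written first, then the nodes are refreshed
dn_incl = fqdn_lines(survivors)   # written once decommissioning has finished
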
    def get_description(self):
        return _("This plugin provides an ability to launch Spark on Hadoop "
                 "CDH cluster without any management consoles.")

"{split($NF,a,\"/\"); print a[1]}'" "| xargs sudo kill -9")) def start_spark_master(nn_remote, sp_home): nn_remote.execute_command("bash " + os.path.join(sp_home, "sbin/start-all.sh")) def stop_spark(nn_remote, sp_home): nn_remote.execute_command("bash " + os.path.join(sp_home, "sbin/stop-all.sh")) @utils.event_wrapper( True, step=_("Await DataNodes start up"), param=("cluster", 0)) def await_datanodes(cluster): datanodes_count = len(utils.get_instances(cluster, "datanode")) if datanodes_count < 1: return log_msg = _("Waiting on %d DataNodes to start up") % datanodes_count with utils.get_instance(cluster, "namenode").remote() as r: utils.plugin_option_poll( cluster, _check_datanodes_count, c_helper.DATANODES_STARTUP_TIMEOUT, log_msg, 1, {"remote": r, "count": datanodes_count}) def _check_datanodes_count(remote, count): if count < 1:
#    implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

import os

import six

from sahara.plugins import context
from sahara.plugins import utils
from sahara_plugin_spark.i18n import _
from sahara_plugin_spark.plugins.spark import config_helper as c_helper
from sahara_plugin_spark.plugins.spark import run_scripts as run


@utils.event_wrapper(True, step=_("Decommission %s") % "Slaves")
def decommission_sl(master, inst_to_be_deleted, survived_inst):
    if survived_inst is not None:
        slavenames = []
        for slave in survived_inst:
            slavenames.append(slave.hostname())
        slaves_content = c_helper.generate_spark_slaves_configs(slavenames)
    else:
        slaves_content = "\n"

    cluster = master.cluster
    sp_home = utils.get_config_value_or_default("Spark", "Spark home", cluster)
    r_master = utils.get_remote(master)
    run.stop_spark(r_master, sp_home)

    # write new slave file to master
"{split($NF,a,\"/\"); print a[1]}'" "| xargs sudo kill -9")) def start_spark_master(nn_remote, sp_home): nn_remote.execute_command("bash " + os.path.join(sp_home, "sbin/start-all.sh")) def stop_spark(nn_remote, sp_home): nn_remote.execute_command("bash " + os.path.join(sp_home, "sbin/stop-all.sh")) @utils.event_wrapper(True, step=_("Await DataNodes start up"), param=("cluster", 0)) def await_datanodes(cluster): datanodes_count = len(utils.get_instances(cluster, "datanode")) if datanodes_count < 1: return log_msg = _("Waiting on %d DataNodes to start up") % datanodes_count with utils.get_instance(cluster, "namenode").remote() as r: utils.plugin_option_poll(cluster, _check_datanodes_count, c_helper.DATANODES_STARTUP_TIMEOUT, log_msg, 1, { "remote": r, "count": datanodes_count })