def start_cluster(self, cluster):
    nn_instance = utils.get_namenode(cluster)
    datanodes = utils.get_datanodes(cluster)
    jt_instance = utils.get_jobtracker(cluster)
    tasktrackers = utils.get_tasktrackers(cluster)
    oozie = utils.get_oozie(cluster)

    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_process(r, "namenode")

    snns = utils.get_secondarynamenodes(cluster)
    if snns:
        for snn in snns:
            run.start_process(remote.get_remote(snn), "secondarynamenode")

    for dn in datanodes:
        run.start_process(remote.get_remote(dn), "datanode")
    LOG.info("HDFS service at '%s' has been started", nn_instance.hostname)

    if jt_instance:
        run.start_process(remote.get_remote(jt_instance), "jobtracker")
        for tt in tasktrackers:
            run.start_process(remote.get_remote(tt), "tasktracker")
        LOG.info("MapReduce service at '%s' has been started",
                 jt_instance.hostname)

    if oozie:
        with remote.get_remote(oozie) as r:
            run.oozie_share_lib(r, nn_instance.hostname)
            run.start_oozie(r)
            LOG.info("Oozie service at '%s' has been started",
                     nn_instance.hostname)

    LOG.info('Cluster %s has been started successfully' % cluster.name)
    self._set_cluster_info(cluster)
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    mng_ip = u.get_instance(cluster, 'manager').management_ip
    client = c.IntelClient(mng_ip, cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     cluster.extra['manager_authzkeyfile_path'])
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
def start_cluster(self, cluster):
    nn_instance = utils.get_namenode(cluster)
    datanodes = utils.get_datanodes(cluster)
    jt_instance = utils.get_jobtracker(cluster)
    tasktrackers = utils.get_tasktrackers(cluster)
    oozie = utils.get_oozie(cluster)
    hive_server = utils.get_hiveserver(cluster)

    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_process(r, "namenode")

    snns = utils.get_secondarynamenodes(cluster)
    if snns:
        for snn in snns:
            run.start_process(remote.get_remote(snn), "secondarynamenode")

    for dn in datanodes:
        run.start_process(remote.get_remote(dn), "datanode")
    LOG.info("HDFS service at '%s' has been started", nn_instance.hostname)

    if jt_instance:
        run.start_process(remote.get_remote(jt_instance), "jobtracker")
        for tt in tasktrackers:
            run.start_process(remote.get_remote(tt), "tasktracker")
        LOG.info("MapReduce service at '%s' has been started",
                 jt_instance.hostname)

    if oozie:
        with remote.get_remote(oozie) as r:
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r, oozie)
                run.oozie_create_db(r)
            run.oozie_share_lib(r, nn_instance.hostname)
            run.start_oozie(r)
            LOG.info("Oozie service at '%s' has been started",
                     nn_instance.hostname)

    if hive_server:
        with remote.get_remote(nn_instance) as r:
            run.hive_create_warehouse_dir(r)
        if c_helper.is_mysql_enable(cluster):
            with remote.get_remote(hive_server) as h:
                if not oozie or hive_server.hostname != oozie.hostname:
                    run.mysql_start(h, hive_server)
                run.hive_create_db(h)
                run.hive_metastore_start(h)
                LOG.info("Hive Metastore server at %s has been started",
                         hive_server.hostname)

    LOG.info('Cluster %s has been started successfully' % cluster.name)
    self._set_cluster_info(cluster)
def _push_configs_to_nodes(self, cluster, instances=None):
    extra = self._extract_configs_to_extra(cluster)

    if instances is None:
        instances = utils.get_instances(cluster)

    for inst in instances:
        ng_extra = extra[inst.node_group.id]
        files = {
            '/etc/hadoop/core-site.xml': ng_extra['xml']['core-site'],
            '/etc/hadoop/mapred-site.xml': ng_extra['xml']['mapred-site'],
            '/etc/hadoop/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
            '/tmp/savanna-hadoop-init.sh': ng_extra['setup_script']
        }
        with remote.get_remote(inst) as r:
            # TODO(aignatov): sudo chown is wrong solution. But it works.
            r.execute_command(
                'sudo chown -R $USER:$USER /etc/hadoop'
            )
            r.execute_command(
                'sudo chown -R $USER:$USER /opt/oozie/conf'
            )
            r.write_files_to(files)
            r.execute_command(
                'sudo chmod 0500 /tmp/savanna-hadoop-init.sh'
            )
            r.execute_command(
                'sudo /tmp/savanna-hadoop-init.sh '
                '>> /tmp/savanna-hadoop-init.log 2>&1')

    nn = utils.get_namenode(cluster)
    jt = utils.get_jobtracker(cluster)

    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.incl',
                        utils.generate_fqdn_host_names(
                            utils.get_datanodes(cluster)))
    if jt:
        with remote.get_remote(jt) as r:
            r.write_file_to('/etc/hadoop/tt.incl',
                            utils.generate_fqdn_host_names(
                                utils.get_tasktrackers(cluster)))

    oozie = utils.get_oozie(cluster)
    if oozie:
        with remote.get_remote(oozie) as r:
            r.write_file_to('/opt/oozie/conf/oozie-site.xml',
                            extra[oozie.node_group.id]
                            ['xml']['oozie-site'])
def _configure_services(client, cluster):
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    jt_host = u.get_jobtracker(cluster).fqdn() if u.get_jobtracker(
        cluster) else None
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_jobtracker(cluster):
        services += ['mapred']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])
    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    if jt_host:
        client.services.mapred.add_nodes('JobTracker', [jt_host])
        client.services.mapred.add_nodes('TaskTracker', tt_hosts)
def _configure_services(client, cluster):
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    jt_host = u.get_jobtracker(cluster).fqdn()
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(
        cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_jobtracker(cluster):
        services += ['mapred']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])
    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    client.services.mapred.add_nodes('JobTracker', [jt_host])
    client.services.mapred.add_nodes('TaskTracker', tt_hosts)
def decommission_nodes(self, cluster, instances):
    tts = utils.get_tasktrackers(cluster)
    dns = utils.get_datanodes(cluster)
    decommission_dns = False
    decommission_tts = False

    for i in instances:
        if 'datanode' in i.node_group.node_processes:
            dns.remove(i)
            decommission_dns = True
        if 'tasktracker' in i.node_group.node_processes:
            tts.remove(i)
            decommission_tts = True

    nn = utils.get_namenode(cluster)
    jt = utils.get_jobtracker(cluster)

    if decommission_tts:
        sc.decommission_tt(jt, instances, tts)
    if decommission_dns:
        sc.decommission_dn(nn, instances, dns)
def _push_jobtracker_configs(self, cluster, r):
    r.write_file_to(
        '/etc/hadoop/tt.incl',
        utils.generate_fqdn_host_names(utils.get_tasktrackers(cluster)))
def _push_jobtracker_configs(self, cluster, r):
    r.write_file_to('/etc/hadoop/tt.incl',
                    utils.generate_fqdn_host_names(
                        utils.get_tasktrackers(cluster)))
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        # TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    mng_ip = u.get_instances(cluster, 'manager')[0].management_ip
    client = c.IntelClient(mng_ip, cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        # TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)