def check_ssh():
    """Verify that this host can SSH into every slave using key-based auth.

    Exits via exit_with_msg() with an explanatory message when
    authentication fails or no connection can be established; returns
    normally when every slave is reachable.
    """
    for slave in get_slaves():
        ssh = paramiko.SSHClient()
        # Auto-accept unknown host keys so first-time connections succeed.
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(hostname=slave, look_for_keys=True)
        except paramiko.ssh_exception.AuthenticationException as e:
            msg = (
                "Received an exception: {}"
                "\nThis means that either:\n"
                "1. The username on this host does not match the one on {}. "
                "Please create a user that matches.\n"
                "2. SSH keys have not been installed. "
                "Please run ./ssh_key_copy.py to do so. You may need to change "
                "the usernames in that script to match your own.").format(
                    e, slave)
            exit_with_msg(msg)
        except paramiko.ssh_exception.NoValidConnectionsError as e:
            msg = ""
            # e.errors maps (host, port) -> underlying error.
            for addr, err in e.errors.items():
                # Trailing space keeps consecutive sentences from running
                # together in the final message.
                msg += "Received '{}' while connecting to {} on port {}. ".format(
                    err[1], addr[0], addr[1])
            msg += "Check that SSH is enabled on " + slave
            msg += "\n"
            exit_with_msg(msg)
        finally:
            # Always release the client socket, even on the error paths.
            ssh.close()
def modify_monitoring(self, path):
    """Start or stop monitoring on every slave, depending on *path*.

    Each request is dispatched from its own thread so the slaves are
    contacted near-simultaneously rather than one after another.
    """
    for node in get_slaves(True):
        worker = Thread(target=self.send_request, args=(node, path))
        worker.start()
def check_time_skew():
    """Report the clock difference between this host and every slave.

    Requires the ``clockdiff`` utility (iputils). Exits via
    exit_with_msg() when clockdiff is missing or when any slave's clock
    differs from this host's by more than the 20-second threshold.
    """
    process = subprocess.Popen(["which", "clockdiff"], stdout=subprocess.PIPE)
    if process.communicate()[0] == "":
        # Note: space after the period so the two sentences don't run
        # together in the printed message.
        msg = "Need clockdiff to calculate time skew. " \
              "Please install using sudo apt install iputils-clockdiff"
        exit_with_msg(msg)
    print("\nCalculating time skew...")
    print("------------------------")
    time_threshold = 20000  # 20 seconds of allowed skew, in milliseconds
    above_threshold = False
    for slave in get_slaves():
        process = subprocess.Popen(["clockdiff", "-o", slave],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout, _ = process.communicate()
        # Field 1 of clockdiff's output is treated as the offset in ms
        # -- TODO confirm against the installed clockdiff version.
        time_skew = int(stdout.split()[1])
        if abs(time_skew) >= time_threshold:
            sys.stdout.write("Host:\t" + slave)
            sys.stdout.write("\033[1;31m")  # red: skew above threshold
            sys.stdout.write("\tDifference:\t" + str(time_skew) + "ms\n")
            sys.stdout.write("\033[0;0m")  # reset terminal colour
            sys.stdout.flush()
            above_threshold = True
        else:
            print("Host:\t{}\tDifference:\t{}ms".format(slave, time_skew))
    if above_threshold:
        msg = "The host(s) displayed in red means that time difference between the host "\
              "and the master(this node) is greater than the threshold (20 seconds). "\
              "Please consider using NTP to synchronise the time in the cluster."
        exit_with_msg(msg)
    # Space after "fine." fixes the previously run-together sentences.
    print("The time difference between nodes in the cluster is fine. "
          "You may start up hadoop using ./run_dfs.py.")
def check_java_hadoop_slaves(java_vers, hadoop_vers):
    """Check the Java and Hadoop versions on all slaves in parallel.

    Args:
        java_vers: expected Java version, passed through to check_slave_vers.
        hadoop_vers: expected Hadoop version, passed through to
            check_slave_vers.

    Blocks until every per-slave check thread has finished.
    """
    java_lock = threading.Lock()
    hadoop_lock = threading.Lock()
    threads = []
    print("\nChecking Java and Hadoop versions on the slaves...")
    for slave in get_slaves():
        t = threading.Thread(target=check_slave_vers,
                             args=(slave, java_vers, java_lock, hadoop_vers,
                                   hadoop_lock))
        # Daemon threads so a hung slave check cannot block interpreter
        # exit. `t.daemon = True` replaces the deprecated setDaemon().
        t.daemon = True
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
def check_ping():
    """Ping each slave once and report reachability.

    Exits via exit_with_msg() when any slave is unknown or unreachable,
    or when the slave list is empty; otherwise prints the reachable
    nodes.
    """
    successful_ping = []
    fail_ping = []
    unknown_ping = []
    for slave in get_slaves():
        process = subprocess.Popen(["ping", "-c", "1", slave],
                                   stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        # Classify by scraping ping's output -- fragile across ping
        # versions/locales, but matches the original behaviour.
        success_msg = "1 packets transmitted, 1 received, 0% packet loss"
        if success_msg in stdout:
            successful_ping.append(slave)
        elif "unknown host" in stdout:
            unknown_ping.append(slave)
        else:
            fail_ping.append(slave)
    if unknown_ping:
        unknown = ", ".join(unknown_ping)
        msg = "The following hosts are unknown: " + unknown + ". Please add "\
              "these hostnames and the corresponding IP addresses to /etc/hosts"
        exit_with_msg(msg)
    if fail_ping:
        fails = ", ".join(fail_ping)
        msg = "Cannot ping " + fails + ". Please check connectivity to these nodes."
        exit_with_msg(msg)
    if not successful_ping:
        # Space after "datanodes." fixes the run-together sentences.
        msg = "Please configure the Hadoop slave file to list the datanodes. " \
              "This file is typically in /usr/local/hadoop/etc/hadoop/slaves"
        exit_with_msg(msg)
    success = ", ".join(successful_ping)
    # "Successfully" fixes the user-visible typo "Sucessfully".
    print("Successfully ping-ed these nodes: " + success)
def run_sudo_command(client, cmd):
    """Run *cmd* over the open SSH *client*, answering sudo's password prompt.

    SECURITY NOTE(review): the sudo password below is hard-coded; it
    should come from a secure source (prompt/keyring) rather than being
    committed in the script.
    """
    stdin, stdout, stderr = client.exec_command(cmd, get_pty=True)
    stdin.write('hduser\n')
    stdin.flush()
    # Reading stdout blocks until the remote command has finished.
    data = stdout.read()


# Template for each slave's /etc/hosts: loopback entries plus the
# standard IPv6 boilerplate.
new_contents = '127.0.0.1\tlocalhost\n#127.0.1.1\thduser\n\n'\
    '# The following lines are desirable for IPv6 capable hosts\n' \
    '::1\tip6-localhost ip6-loopback\nfe00::0\tip6-localnet\n' \
    'ff00::0\tip6-mcastprefix\nff02::1\tip6-allnodes\n'\
    'ff02::2\tip6-allrouters\nff02::3\tip6-allhosts\n\n'
# node_ip_hostname.txt presumably holds the cluster's IP/hostname pairs
# in /etc/hosts syntax -- TODO confirm against the file's producer.
with open('node_ip_hostname.txt', 'r') as f:
    new_contents += f.read()
for slave in get_slaves():
    print('Modifying ' + slave)
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    # NOTE(review): username is redacted/placeholder in this copy.
    ssh.connect(hostname=slave, username='******', look_for_keys=True)
    # Overwrite /etc/hosts, then set the hostname both live and on disk.
    cmd = 'echo "{}\n" | sudo tee /etc/hosts'.format(new_contents)
    run_sudo_command(ssh, cmd)
    cmd = 'sudo hostname ' + slave
    run_sudo_command(ssh, cmd)
    cmd = 'echo "{}" | sudo tee /etc/hostname'.format(slave)
    run_sudo_command(ssh, cmd)
# NOTE(review): this is the tail of a larger daemon-health routine;
# snn_host, snn_err_msg, not_running, run_cmd, add_to_list and
# exit_with_msg are defined before this chunk.
if snn_host == "":
    exit_with_msg(snn_err_msg)
try:
    # Resolve the secondary namenode address to a hostname for ssh.
    snn_hostname = socket.gethostbyaddr(snn_host)[0]
except socket.herror as err:
    # NOTE(review): err[1] indexes the exception args (Python 2 idiom);
    # Python 3 would need err.args[1]/err.strerror -- confirm the target
    # interpreter before changing.
    msg = "Received error: " + err[1] + "\n"
    exit_with_msg(msg + snn_err_msg)
# Daemon presence is checked by grepping each host's `jps` output.
jps_to_check = run_cmd("ssh " + snn_hostname + " jps")
if "SecondaryNameNode" not in jps_to_check:
    add_to_list(not_running, snn_hostname, "Secondary Name Node")
# Every slave should run both a DataNode and a NodeManager.
for slave in get_slaves(False):
    jps_to_check = run_cmd("ssh " + slave + " jps")
    if "DataNode" not in jps_to_check:
        add_to_list(not_running, slave, "Data Node")
    if "NodeManager" not in jps_to_check:
        add_to_list(not_running, slave, "Node Manager")
if not_running:
    # One line per host listing its missing daemons.
    msg = "Some Hadoop daemons are not running: \n"
    for host, daemon_list in sorted(not_running.items()):
        msg += host + ": "
        msg += ", ".join(daemon_list)
        msg += "\n"
    exit_with_msg(msg)
#!/usr/bin/env python
"""Copy the slave-specific Hadoop config files to every slave via scp."""
import os

from get_hadoop_attributes import get_slaves

file_list = [
    "core-site.xml", "mapred-site.xml", "hdfs-site.xml", "yarn-site.xml"
]
# Fall back to the conventional install path when HADOOP_HOME is unset.
hadoop_path = os.getenv("HADOOP_HOME", "/usr/local/hadoop/")
path_to_file = os.path.join(hadoop_path, "etc", "hadoop")
for f in file_list:
    for slave in get_slaves(True):
        # print() form works on both Python 2 and 3; the original
        # `print slave` statement is a syntax error under Python 3.
        print(slave)
        cmd = "scp " + path_to_file + "/slave_conf/" + f + " " + slave + ":" + path_to_file
        os.system(cmd)