def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    print("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
            remotepath='/tmp/download-hadoop.py')

    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/download-hadoop.py "{version}"
            mkdir "hadoop"
            mkdir "hadoop/conf"
            tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
            rm "hadoop-{version}.tar.gz"
        """.format(version=self.version))
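# NOTE: Every install snippet in this section calls an ssh_check_output()
# helper that is not defined here. The sketch below is an assumption about
# its contract, built only from standard paramiko calls: run a command on
# the remote host, raise if it exits non-zero, and return its stdout. The
# real helper may differ.
def ssh_check_output(*, client: paramiko.client.SSHClient, command: str) -> str:
    stdin, stdout, stderr = client.exec_command(command)
    exit_status = stdout.channel.recv_exit_status()  # block until the command finishes
    output = stdout.read().decode('utf-8')
    if exit_status != 0:
        raise Exception(
            "Remote command failed with status {c}:\n{err}".format(
                c=exit_status, err=stderr.read().decode('utf-8')))
    return output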
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    print("[{h}] Installing Spark...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    try:
        if self.version:
            with ssh_client.open_sftp() as sftp:
                sftp.put(
                    localpath=os.path.join(SCRIPTS_DIR, 'install-spark.sh'),
                    remotepath='/tmp/install-spark.sh')
                sftp.chmod(path='/tmp/install-spark.sh', mode=0o755)
            url = self.download_source.format(v=self.version)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    /tmp/install-spark.sh {url}
                    rm -f /tmp/install-spark.sh
                """.format(url=shlex.quote(url)))
        else:
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    sudo yum install -y git
                    sudo yum install -y java-devel
                """)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    git clone {repo} spark
                    cd spark
                    git reset --hard {commit}
                    if [ -e "make-distribution.sh" ]; then
                        ./make-distribution.sh -Phadoop-2.6
                    else
                        ./dev/make-distribution.sh -Phadoop-2.6
                    fi
                """.format(
                    repo=shlex.quote(self.git_repository),
                    commit=shlex.quote(self.git_commit)))
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done
                echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
            """)
    except Exception as e:
        # TODO: This should be a more specific exception.
        print("Error: Failed to install Spark.", file=sys.stderr)
        print(e, file=sys.stderr)
        raise
def install(self, ssh_client: paramiko.client.SSHClient, cluster: FlintrockCluster):
    logger.info("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
                 remotepath='/tmp/download-hadoop.py')

    ssh_check_output(client=ssh_client, command="""
        set -e
        python /tmp/download-hadoop.py "{version}" "{download_source}"
        mkdir "hadoop"
        mkdir "hadoop/conf"
        tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
        rm "hadoop-{version}.tar.gz"
        for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
            sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
        done
        echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
    """.format(version=self.version, download_source=self.download_source))
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster,
):
    logger.info("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
                 remotepath='/tmp/download-package.py')

    logger.debug("[{h}] Downloading Hadoop from: {s}".format(
        h=ssh_client.get_transport().getpeername()[0],
        s=self.download_source,
    ))

    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/download-package.py "{download_source}" "hadoop"
            for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
            done
            echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
        """.format(
            download_source=self.download_source.format(v=self.version),
        ))
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    logger.info("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
            remotepath='/tmp/download-package.py')

    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/download-package.py "{download_source}" "hadoop"
            for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
            done
            echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
        """.format(
            download_source=self.download_source.format(v=self.version),
        ))
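# For illustration only: download_source is a URL template whose {v}
# placeholder is expanded with the configured version, as in the snippet
# above. The URL below is a hypothetical example value, not necessarily
# the project default.
download_source = 'https://archive.apache.org/dist/hadoop/common/hadoop-{v}/hadoop-{v}.tar.gz'
print(download_source.format(v='2.8.5'))
# https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz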
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    print("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
            remotepath='/tmp/download-hadoop.py')

    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/download-hadoop.py "{version}" "{download_source}"
            mkdir "hadoop"
            mkdir "hadoop/conf"
            tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
            rm "hadoop-{version}.tar.gz"
            for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
            done
        """.format(version=self.version, download_source=self.download_source))
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    print("[{h}] Installing HDFS...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
            remotepath='/tmp/download-hadoop.py')

    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/download-hadoop.py "{version}" "{download_source}"
            mkdir "hadoop"
            mkdir "hadoop/conf"
            tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
            rm "hadoop-{version}.tar.gz"
        """.format(version=self.version, download_source=self.download_source))
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    logger.info("[{h}] Installing Spark...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    if self.version:
        with ssh_client.open_sftp() as sftp:
            sftp.put(
                localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
                remotepath='/tmp/download-package.py')
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                python /tmp/download-package.py "{download_source}" "spark"
            """.format(
                download_source=self.download_source.format(v=self.version),
            ))
    else:
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                sudo yum install -y git
                sudo yum install -y java-devel
            """)
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                git clone {repo} spark
                cd spark
                git reset --hard {commit}
                if [ -e "make-distribution.sh" ]; then
                    ./make-distribution.sh -Phadoop-{hadoop_short_version}
                else
                    ./dev/make-distribution.sh -Phadoop-{hadoop_short_version}
                fi
            """.format(
                repo=shlex.quote(self.git_repository),
                commit=shlex.quote(self.git_commit),
                # Hardcoding this here until we figure out a better way to handle
                # the supported build profiles.
                hadoop_short_version='2.7',
            ))
    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
                sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
            done
            echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
        """)
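# download-package.py is an elided helper script. Judging from how it is
# invoked above and from the inline tar commands in the older snippets, it
# presumably fetches a tarball URL and unpacks it into the named directory,
# stripping the top-level path component. A hypothetical local sketch of
# that behavior:
import io
import os
import tarfile
import urllib.request

def download_package(url: str, target_dir: str):
    with urllib.request.urlopen(url) as response:
        data = response.read()
    os.makedirs(target_dir, exist_ok=True)
    with tarfile.open(fileobj=io.BytesIO(data), mode='r:gz') as tar:
        for member in tar.getmembers():
            # Strip the leading "hadoop-x.y.z/" component, like
            # `tar --strip-components=1`.
            member.name = '/'.join(member.name.split('/')[1:])
            if member.name:
                tar.extract(member, path=target_dir)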
def setup_node(
        *,
        # Change this to take host, user, and identity_file?
        # Add some kind of caching for SSH connections so that they
        # can be looked up by host and reused?
        ssh_client: paramiko.client.SSHClient,
        services: list,
        java_version: int,
        cluster: FlintrockCluster):
    """
    Setup a new node.

    Cluster methods like provision_node() and add_slaves_node() should
    delegate the main work of setting up new nodes to this function.
    """
    host = ssh_client.get_transport().getpeername()[0]
    ssh_check_output(client=ssh_client, command="""
        set -e
        echo {private_key} > "$HOME/.ssh/id_rsa"
        echo {public_key} >> "$HOME/.ssh/authorized_keys"
        chmod 400 "$HOME/.ssh/id_rsa"
    """.format(private_key=shlex.quote(cluster.ssh_key_pair.private),
               public_key=shlex.quote(cluster.ssh_key_pair.public)))

    with ssh_client.open_sftp() as sftp:
        sftp.put(localpath=os.path.join(SCRIPTS_DIR, 'setup-ephemeral-storage.py'),
                 remotepath='/tmp/setup-ephemeral-storage.py')

    logger.info("[{h}] Configuring ephemeral storage...".format(h=host))
    # TODO: Print some kind of warning if storage is large, since formatting
    #       will take several minutes (~4 minutes for 2TB).
    storage_dirs_raw = ssh_check_output(client=ssh_client, command="""
        set -e
        python /tmp/setup-ephemeral-storage.py
        rm -f /tmp/setup-ephemeral-storage.py
    """)
    storage_dirs = json.loads(storage_dirs_raw)

    cluster.storage_dirs.root = storage_dirs['root']
    cluster.storage_dirs.ephemeral = storage_dirs['ephemeral']

    ensure_java(ssh_client, java_version)

    for service in services:
        try:
            service.install(
                ssh_client=ssh_client,
                cluster=cluster,
            )
        except Exception as e:
            raise Exception("Failed to install {}.".format(
                type(service).__name__)) from e
def install(self, ssh_client: paramiko.client.SSHClient, cluster: FlintrockCluster):
    logger.info("[{h}] Installing Spark...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    if self.version:
        with ssh_client.open_sftp() as sftp:
            sftp.put(localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
                     remotepath='/tmp/download-package.py')
        ssh_check_output(client=ssh_client, command="""
            set -e
            python /tmp/download-package.py "{download_source}" "spark"
        """.format(
            download_source=self.download_source.format(v=self.version),
        ))
    else:
        ssh_check_output(client=ssh_client, command="""
            set -e
            sudo yum install -y git
            sudo yum install -y java-devel
        """)
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                git clone {repo} spark
                cd spark
                git reset --hard {commit}
                if [ -e "make-distribution.sh" ]; then
                    ./make-distribution.sh -Phadoop-{hadoop_short_version}
                else
                    ./dev/make-distribution.sh -Phadoop-{hadoop_short_version}
                fi
            """.format(
                repo=shlex.quote(self.git_repository),
                commit=shlex.quote(self.git_commit),
                # Hardcoding this here until we figure out a better way to handle
                # the supported build profiles.
                hadoop_short_version='2.7',
            ))
    ssh_check_output(client=ssh_client, command="""
        set -e
        for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
            sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
        done
        echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
    """)
def setup_node(
        *,
        # Change this to take host, user, and identity_file?
        # Add some kind of caching for SSH connections so that they
        # can be looked up by host and reused?
        ssh_client: paramiko.client.SSHClient,
        services: list,
        cluster: FlintrockCluster):
    """
    Setup a new node.

    Cluster methods like provision_node() and add_slaves_node() should
    delegate the main work of setting up new nodes to this function.
    """
    host = ssh_client.get_transport().getpeername()[0]
    ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            echo {private_key} > "$HOME/.ssh/id_rsa"
            echo {public_key} >> "$HOME/.ssh/authorized_keys"
            chmod 400 "$HOME/.ssh/id_rsa"
        """.format(
            private_key=shlex.quote(cluster.ssh_key_pair.private),
            public_key=shlex.quote(cluster.ssh_key_pair.public)))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'setup-ephemeral-storage.py'),
            remotepath='/tmp/setup-ephemeral-storage.py')

    logger.info("[{h}] Configuring ephemeral storage...".format(h=host))
    # TODO: Print some kind of warning if storage is large, since formatting
    #       will take several minutes (~4 minutes for 2TB).
    storage_dirs_raw = ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/setup-ephemeral-storage.py
            rm -f /tmp/setup-ephemeral-storage.py
        """)
    storage_dirs = json.loads(storage_dirs_raw)

    cluster.storage_dirs.root = storage_dirs['root']
    cluster.storage_dirs.ephemeral = storage_dirs['ephemeral']

    ensure_java8(ssh_client)

    for service in services:
        service.install(
            ssh_client=ssh_client,
            cluster=cluster)
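# ensure_java8() above is not defined anywhere in these snippets. The sketch
# below is a guess at what such a helper might do on a yum-based image,
# reusing the ssh_check_output() contract sketched earlier; the real
# implementation may differ.
def ensure_java8(client: paramiko.client.SSHClient):
    ssh_check_output(
        client=client,
        command="""
            set -e
            if ! command -v java > /dev/null 2>&1; then
                sudo yum install -y java-1.8.0-openjdk
            fi
        """)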
def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster):
    # TODO: Allow users to specify the Spark "distribution". (?)
    distribution = 'hadoop2.6'

    print("[{h}] Installing Spark...".format(
        h=ssh_client.get_transport().getpeername()[0]))

    try:
        if self.version:
            with ssh_client.open_sftp() as sftp:
                sftp.put(
                    localpath=os.path.join(SCRIPTS_DIR, 'install-spark.sh'),
                    remotepath='/tmp/install-spark.sh')
                sftp.chmod(path='/tmp/install-spark.sh', mode=0o755)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    /tmp/install-spark.sh {spark_version} {distribution}
                    rm -f /tmp/install-spark.sh
                """.format(
                    spark_version=shlex.quote(self.version),
                    distribution=shlex.quote(distribution)))
        else:
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    sudo yum install -y git
                    sudo yum install -y java-devel
                """)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    git clone {repo} spark
                    cd spark
                    git reset --hard {commit}
                    if [ -e "make-distribution.sh" ]; then
                        ./make-distribution.sh -Phadoop-2.6
                    else
                        ./dev/make-distribution.sh -Phadoop-2.6
                    fi
                """.format(
                    repo=shlex.quote(self.git_repository),
                    commit=shlex.quote(self.git_commit)))
    except Exception as e:
        # TODO: This should be a more specific exception.
        print("Error: Failed to install Spark.", file=sys.stderr)
        print(e, file=sys.stderr)
        raise
def uploadFile(sourcePath: str, targetPath: str,
               sshc: paramiko.client.SSHClient,
               compress_method: str = None,
               verbose: bool = True) -> str:
    def show_progress(filename, size, sent):
        print(f"Uploading {filename} progress: "
              + f"{float(sent)/float(size)*100:.2f}%", end="\r")

    progress = show_progress if verbose else None
    # Track cleanup state explicitly instead of probing locals().
    isArchived = False
    isUnarchived = False
    try:
        if compress_method:
            fileName = pathlib.Path(sourcePath).name
            # Change targetPath for uploading to
            # targetPath's directory / sourcePath's name + ext.
            targetPath = pathlib.Path(
                str(pathlib.Path(targetPath).parent / fileName)
                + "." + compress_method)
            sourcePath = archiveFile(sourcePath, verbose=verbose,
                                     method=compress_method)
            isArchived = True

        with scp.SCPClient(sshc.get_transport(), progress=progress) as scpc:
            # In case the paths are PosixPath objects, cast them to str.
            scpc.put(str(sourcePath), str(targetPath))
        print()  # move past the \r progress line

        if compress_method:
            unarchiveSSH(targetPath, sshc, method=compress_method,
                         verbose=verbose)
            isUnarchived = True
            # Change targetPath to the uploaded raw file.
            uploadedPath = str(pathlib.Path(targetPath).parent / fileName)
    finally:
        # Delete the local and remote archive files.
        if isArchived:
            with verbosity_context(f"Deleting archive {sourcePath}", verbose):
                os.remove(sourcePath)
        if isUnarchived:
            with sshc.open_sftp() as sftp:
                with verbosity_context(f"Deleting archive {targetPath} via SFTP",
                                       verbose):
                    sftp.remove(str(targetPath))

    return uploadedPath if compress_method else targetPath
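# Hypothetical usage of uploadFile(): the host, user, key path, file paths,
# and the 'gzip' method are all made-up example values, and archiveFile(),
# unarchiveSSH(), and verbosity_context() are assumed to be defined elsewhere.
import os

import paramiko

client = paramiko.client.SSHClient()
client.set_missing_host_key_policy(paramiko.client.AutoAddPolicy())
client.connect('203.0.113.10', username='ec2-user',
               key_filename=os.path.expanduser('~/.ssh/id_rsa'))
uploaded = uploadFile(
    sourcePath='data/model.bin',
    targetPath='/tmp/model.bin',
    sshc=client,
    compress_method='gzip',  # archive locally, upload, unarchive remotely
)
print(f"Uploaded to: {uploaded}")
client.close()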