def __setup_ssh_config(self):
    with remote_open('/etc/ssh/ssh_config', use_sudo=True) as f:
        f.write(heredoc("""
            Host spark-master
                CheckHostIP no
                HashKnownHosts no"""))
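# 'heredoc' is a helper defined elsewhere in this codebase and not shown in this
# section. A minimal sketch of its assumed behavior -- dropping the newline that
# follows the opening triple quote and dedenting the literal so the block is
# written flush-left. This is hypothetical, for illustration only; the real
# helper may additionally interpolate variables the way fmt() does.

import textwrap

def heredoc(s):
    # Remove the leading newline, strip the common indentation and make sure
    # the result ends with a newline so it can be appended to a config file.
    if s.startswith('\n'):
        s = s[1:]
    if not s.endswith('\n'):
        s += '\n'
    return textwrap.dedent(s)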
def __setup_path(self):
    globally = True
    if globally:
        with remote_open('/etc/environment', use_sudo=True) as f:
            new_path = [fmt('{install_dir}/{package}/bin')
                        for package in ('spark', 'hadoop')]
            self._patch_etc_environment(f, dirs=new_path)
    else:
        for _user in (user, self.admin_account()):
            with settings(user=_user):
                with remote_open('~/.profile') as f:
                    f.write('\n')
                    for package in ('spark', 'hadoop'):
                        # We don't include sbin here because too many file names
                        # collide in Spark's and Hadoop's sbin
                        f.write(fmt('PATH="$PATH:{install_dir}/{package}/bin"\n'))
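# _patch_etc_environment is inherited from a base class and not shown in this
# section. Based on its call sites here, a minimal sketch of the assumed
# contract (hypothetical, not the actual implementation): env_pairs sets plain
# NAME="value" entries, while dirs appends to a colon-separated, PATH-like
# variable whose name is given by dirs_var.

def _patch_etc_environment(self, env_file, dirs=None, dirs_var='PATH', env_pairs=None):
    def parse_entry(line):
        name, _, value = line.partition('=')
        return name, value.strip('"')

    # Read the current contents of /etc/environment into a dict
    env_file.seek(0)
    env = dict(parse_entry(line) for line in env_file.read().splitlines() if line)
    if dirs is not None:
        # Merge the new directories into the PATH-like variable, skipping duplicates
        path = [d for d in env.get(dirs_var, '').split(':') if d]
        path.extend(d for d in dirs if d not in path)
        env[dirs_var] = ':'.join(path)
    if env_pairs is not None:
        env.update(env_pairs)
    # Rewrite the file in place
    env_file.seek(0)
    env_file.truncate(0)
    for name in sorted(env):
        env_file.write('%s="%s"\n' % (name, env[name]))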
def __install_yarn(self):
    # Download and extract Hadoop
    path = fmt('hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz')
    self._install_apache_package(path, install_dir)
    # Point HADOOP_HOME at the Hadoop installation, globally
    with remote_open('/etc/environment', use_sudo=True) as f:
        yarn_path = fmt('{install_dir}/hadoop')
        self._patch_etc_environment(f, env_pairs=dict(HADOOP_HOME=yarn_path))
def __install_spark(self):
    # Download and extract Spark
    path = fmt('spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz')
    self._install_apache_package(path, install_dir)
    spark_dir = var_dir + "/spark"
    # Add environment variables to spark-env.sh
    spark_env_sh_path = fmt("{install_dir}/spark/conf/spark-env.sh")
    sudo(fmt("cp {spark_env_sh_path}.template {spark_env_sh_path}"))
    spark_env = dict(
        SPARK_LOG_DIR=self._lazy_mkdir(log_dir, "spark"),
        SPARK_WORKER_DIR=self._lazy_mkdir(spark_dir, "work"),
        SPARK_LOCAL_DIRS=self._lazy_mkdir(spark_dir, "local"),
        JAVA_HOME='/usr/lib/jvm/java-8-oracle',
        SPARK_MASTER_IP='spark-master',
        HADOOP_CONF_DIR=fmt("{install_dir}/hadoop/etc/hadoop"),
        # Advertise the public hostname from the EC2 instance metadata service
        SPARK_PUBLIC_DNS="$(curl -s http://169.254.169.254/latest/meta-data/public-hostname)")
    with remote_open(spark_env_sh_path, use_sudo=True) as spark_env_sh:
        spark_env_sh.write('\n')
        for name, value in spark_env.iteritems():
            spark_env_sh.write(fmt('export {name}="{value}"\n'))
    # Configure Spark properties
    spark_defaults = {
        'spark.eventLog.enabled': 'true',
        'spark.eventLog.dir': self._lazy_mkdir(spark_dir, "history"),
        'spark.master': 'spark://spark-master:7077'}
    spark_defaults_conf_path = fmt("{install_dir}/spark/conf/spark-defaults.conf")
    sudo(fmt("cp {spark_defaults_conf_path}.template {spark_defaults_conf_path}"))
    with remote_open(spark_defaults_conf_path, use_sudo=True) as spark_defaults_conf:
        for name, value in spark_defaults.iteritems():
            spark_defaults_conf.write(fmt("{name}\t{value}\n"))
    # Remove the Windows .cmd scripts to make shell auto-completion easier
    sudo(fmt('find {install_dir}/spark -name "*.cmd" | xargs rm'))
    # Install upstart jobs
    self.__register_upstart_jobs(spark_services)
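# _lazy_mkdir is another inherited helper that isn't shown here. Judging by its
# use above, it returns the path of a child directory under the given parent and
# ensures that directory exists on the box; persistent=True presumably places it
# on persistent storage. A naive sketch that creates the directory eagerly
# (hypothetical; the real helper likely defers creation):

def _lazy_mkdir(self, parent, name, persistent=False):
    path = parent + '/' + name
    sudo(fmt('mkdir -p {path}'))
    return path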
def __install_hadoop(self):
    # Download and extract Hadoop
    path = fmt('hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz')
    self._install_apache_package(path, install_dir)
    # Add environment variables to hadoop-env.sh
    hadoop_env = dict(
        HADOOP_LOG_DIR=self._lazy_mkdir(log_dir, "hadoop"),
        JAVA_HOME='/usr/lib/jvm/java-8-oracle')
    hadoop_env_sh_path = fmt("{install_dir}/hadoop/etc/hadoop/hadoop-env.sh")
    with remote_open(hadoop_env_sh_path, use_sudo=True) as hadoop_env_sh:
        hadoop_env_sh.write('\n')
        for name, value in hadoop_env.iteritems():
            hadoop_env_sh.write(fmt('export {name}="{value}"\n'))
    # Configure HDFS
    hdfs_dir = var_dir + "/hdfs"
    put(use_sudo=True,
        remote_path=fmt('{install_dir}/hadoop/etc/hadoop/hdfs-site.xml'),
        local_path=StringIO(self.__to_hadoop_xml_config({
            'dfs.replication': str(hdfs_replication),
            'dfs.permissions': 'false',
            'dfs.name.dir': self._lazy_mkdir(hdfs_dir, 'name', persistent=True),
            'dfs.data.dir': self._lazy_mkdir(hdfs_dir, 'data', persistent=True),
            'fs.checkpoint.dir': self._lazy_mkdir(hdfs_dir, 'checkpoint', persistent=True),
            'dfs.namenode.http-address': 'spark-master:50070',
            'dfs.namenode.secondary.http-address': 'spark-master:50090'})))
    # Configure Hadoop core
    put(use_sudo=True,
        remote_path=fmt('{install_dir}/hadoop/etc/hadoop/core-site.xml'),
        local_path=StringIO(self.__to_hadoop_xml_config({
            'fs.default.name': 'hdfs://spark-master:8020'})))
    # Remove the Windows .cmd scripts to make shell auto-completion easier
    sudo(fmt('find {install_dir}/hadoop -name "*.cmd" | xargs rm'))
    # Install upstart jobs
    self.__register_upstart_jobs(hadoop_services)
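# __to_hadoop_xml_config turns a dict of property names and values into the
# contents of a Hadoop XML configuration file. A minimal sketch of the assumed
# output format (hypothetical, for illustration only):

from xml.sax.saxutils import escape

def __to_hadoop_xml_config(self, properties):
    lines = ['<?xml version="1.0"?>',
             '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>',
             '<configuration>']
    for name, value in sorted(properties.iteritems()):
        lines.append('    <property>')
        lines.append('        <name>%s</name>' % escape(name))
        lines.append('        <value>%s</value>' % escape(value))
        lines.append('    </property>')
    lines.append('</configuration>')
    return '\n'.join(lines) + '\n'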
def __install_spark(self):
    # Download and extract Spark
    path = fmt('spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz')
    self._install_apache_package(path, install_dir)
    # Patch paths
    with remote_open('/etc/environment', use_sudo=True) as f:
        spark_home = fmt('{install_dir}/spark')
        # These two PYTHONPATH entries are also added by the 'pyspark' wrapper script.
        # We need to replicate them globally because we want to be able to just do
        # 'import pyspark' in Toil's Spark service code and associated tests.
        python_path = [fmt('{spark_home}/python'),
                       run(fmt('ls {spark_home}/python/lib/py4j-*-src.zip')).strip()]
        self._patch_etc_environment(f,
                                    env_pairs=dict(SPARK_HOME=spark_home),
                                    dirs=python_path,
                                    dirs_var='PYTHONPATH')
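# For illustration: assuming install_dir is '/opt' and a Spark build that ships
# py4j-0.9-src.zip (the py4j version varies by Spark release), /etc/environment
# would be expected to end up with entries along these lines:
#
#   SPARK_HOME="/opt/spark"
#   PYTHONPATH="/opt/spark/python:/opt/spark/python/lib/py4j-0.9-src.zip"
#
# which is what allows 'import pyspark' to work in any Python process on the
# box without going through the 'pyspark' wrapper script.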