def setup_gobblin(self, host, port):
    '''
    Configure Gobblin.

    Each time something changes (e.g. a new Hadoop endpoint is present)
    this method must be called.

    :param str host: IP of the HDFS endpoint.
    :param str port: Port of the HDFS endpoint.
    '''
    # Set up the environment
    gobblin_bin = self.dist_config.path('gobblin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if gobblin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], gobblin_bin])
        env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
        env['GOBBLIN_WORK_DIR'] = "/user/gobblin/work"

    hdfs_endpoint = ''.join([host, ':', port])

    # Set up the Gobblin configuration
    conf_dir = self.dist_config.path('gobblin') / 'conf'
    gobblin_config_template = conf_dir / 'gobblin-mapreduce.properties.template'
    gobblin_config = conf_dir / 'gobblin-mapreduce.properties'
    try:
        copy(gobblin_config_template, gobblin_config)
    except FileNotFoundError:
        pass

    utils.re_edit_in_place(gobblin_config, {
        r'fs.uri=hdfs://localhost:8020': 'fs.uri=hdfs://%s' % hdfs_endpoint,
    })

    if '2.7.2' in self.hadoop_version:
        # note: the pattern must be '=.*' (match the old value), not '=*'
        utils.re_edit_in_place(gobblin_config, {
            r'task.data.root.dir=.*': 'task.data.root.dir=${env:GOBBLIN_WORK_DIR}/task'
        }, append_non_matches=True)
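# A minimal sketch of what a context manager like the charm utils'
# `environment_edit_in_place` used above could look like. The real helper
# ships with the charm's utils library; this reconstruction is an
# assumption, kept simple (quoted KEY=VALUE lines only, comments dropped).
import contextlib
import re

@contextlib.contextmanager
def environment_edit_in_place(path='/etc/environment'):
    # Parse KEY=VALUE (optionally quoted) lines into a dict.
    env = {}
    with open(path) as f:
        for line in f:
            m = re.match(r'^\s*([A-Za-z_][A-Za-z0-9_]*)="?(.*?)"?\s*$', line)
            if m:
                env[m.group(1)] = m.group(2)
    yield env  # callers mutate this dict in place
    # Write the (possibly modified) variables back out, quoted.
    with open(path, 'w') as f:
        for key, value in env.items():
            f.write('{}="{}"\n'.format(key, value))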
def setup_zookeeper_config(self):
    """Setup Zookeeper configuration based on default config.

    Copy the default configuration files to zookeeper_conf property
    defined in dist.yaml
    """
    default_conf = self.dist_config.path('zookeeper') / 'conf'
    zookeeper_conf = self.dist_config.path('zookeeper_conf')
    zookeeper_conf.rmtree_p()
    default_conf.copytree(zookeeper_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    default_conf.rmtree_p()
    zookeeper_conf.symlink(default_conf)

    zoo_cfg = zookeeper_conf / 'zoo.cfg'
    if not zoo_cfg.exists():
        (zookeeper_conf / 'zoo_sample.cfg').copy(zoo_cfg)
    utils.re_edit_in_place(zoo_cfg, {
        r'^dataDir.*': 'dataDir={}'.format(self.dist_config.path('zookeeper_data_dir')),
    })

    # Configure zookeeper environment for all users
    zookeeper_bin = self.dist_config.path('zookeeper') / 'bin'
    zookeeper_rest = self.dist_config.path('zookeeper') / 'src/contrib/rest'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zookeeper_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zookeeper_bin])
        env['ZOOCFGDIR'] = self.dist_config.path('zookeeper_conf')
        env['ZOO_BIN_DIR'] = zookeeper_bin
        env['ZOO_LOG_DIR'] = self.dist_config.path('zookeeper_log_dir')
        env['ZOO_REST_DIR'] = zookeeper_rest
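# Hypothetical sketch of the `re_edit_in_place` helper used throughout this
# listing: it applies a {regex: replacement} mapping line by line to a file.
# The real helper is part of the charm utils; this reconstruction is an
# assumption, including the `append_non_matches` behaviour seen above.
import re

def re_edit_in_place(path, subs, append_non_matches=False):
    with open(path) as f:
        lines = f.read().splitlines()
    unmatched = dict(subs)
    for i, line in enumerate(lines):
        for pattern, replacement in subs.items():
            if re.search(pattern, line):
                lines[i] = re.sub(pattern, replacement, line)
                unmatched.pop(pattern, None)
    if append_non_matches:
        # Patterns that matched nothing get their replacement appended,
        # so new settings can be introduced as well as edited.
        lines.extend(unmatched.values())
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')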
def setup_gobblin(self, host, port):
    """
    Configure Gobblin.

    Each time something changes (e.g. a new Hadoop endpoint is present)
    this method must be called.

    :param str host: IP of the HDFS endpoint.
    :param str port: Port of the HDFS endpoint.
    """
    # Set up the environment
    gobblin_bin = self.dist_config.path("gobblin") / "bin"
    with utils.environment_edit_in_place("/etc/environment") as env:
        if gobblin_bin not in env["PATH"]:
            env["PATH"] = ":".join([env["PATH"], gobblin_bin])
        env["HADOOP_BIN_DIR"] = env["HADOOP_HOME"] + "/bin"
        env["GOBBLIN_WORK_DIR"] = "/user/gobblin/work"

    hdfs_endpoint = "".join([host, ":", port])

    # Set up the Gobblin configuration
    conf_dir = self.dist_config.path("gobblin") / "conf"
    gobblin_config_template = conf_dir / "gobblin-mapreduce.properties.template"
    gobblin_config = conf_dir / "gobblin-mapreduce.properties"
    copy(gobblin_config_template, gobblin_config)

    utils.re_edit_in_place(gobblin_config, {r"fs.uri=hdfs://localhost:8020": "fs.uri=hdfs://%s" % hdfs_endpoint})
def install(self):
    self.dist_config.add_users()
    self.dist_config.add_dirs()
    jujuresources.install(self.resources['livy'],
                          destination=self.dist_config.path('livy'),
                          skip_top_level=False)

    livy_bin = self.dist_config.path('livy') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if livy_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], livy_bin])
        # Following classpath comes from `hadoop classpath` and should be fixed
        hadoop_cp = '/etc/hadoop/conf:/usr/lib/hadoop/share/hadoop/common/lib/*:/usr/lib/hadoop/share/hadoop/common/*\
:/usr/lib/hadoop/share/hadoop/hdfs:/usr/lib/hadoop/share/hadoop/hdfs/lib/*\
:/usr/lib/hadoop/share/hadoop/hdfs/*:/usr/lib/hadoop/share/hadoop/yarn/lib/*\
:/usr/lib/hadoop/share/hadoop/yarn/*:/usr/lib/hadoop/share/hadoop/mapreduce/lib/*\
:/usr/lib/hadoop/share/hadoop/mapreduce/*:/usr/lib/hadoop/contrib/capacity-scheduler/*.jar'
        env['CLASSPATH'] = hadoop_cp

    cmd = "chown -R {}:hadoop {}".format(self.user, self.dist_config.path('livy'))
    call(cmd.split())
    cmd = "chown -R {}:hadoop {}".format(self.user, self.dist_config.path('livy_conf'))
    call(cmd.split())
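# The comment above says the hard-coded classpath "should be fixed"; one
# option is to derive it at install time. A sketch, assuming the `hadoop`
# binary is already on the PATH (`hadoop classpath` is a real subcommand
# that prints the colon-separated classpath Hadoop itself uses):
from subprocess import check_output

def hadoop_classpath():
    return check_output(['hadoop', 'classpath']).decode('utf8').strip()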
def installoracle():
    hookenv.log('Installing Oracle JDK')
    filesdir = '{}/files/'.format(charm_dir())
    conf = hookenv.config()
    (tarname, dirname) = get_java_paths(filesdir, conf['install-type'],
                                        conf['java-major'])
    destdir = "/opt/java/{}".format(dirname)
    if not os.path.isdir(destdir):
        tfile = tarfile.open('{}/files/{}'.format(charm_dir(), tarname), 'r')
        # Important to note that the following extraction is
        # UNSAFE since .tar.gz archive could contain
        # relative path like ../../ and overwrite other dirs
        extractdir = '{}/{}'.format(filesdir, dirname)
        tfile.extractall(filesdir)
        mergecopytree(extractdir, destdir)
        # Set defaults
        subprocess.check_output(['update-alternatives', '--install',
                                 '/usr/bin/java', 'java',
                                 '{}/jre/bin/java'.format(destdir), '2000'])
        subprocess.check_output(['update-alternatives', '--install',
                                 '/usr/bin/javac', 'javac',
                                 '{}/bin/javac'.format(destdir), '2000'])
        # set env vars
        with utils.environment_edit_in_place('/etc/environment') as env:
            # ensure that correct java is used
            env['JAVA_HOME'] = destdir
            env['J2SDKDIR'] = destdir
            env['J2REDIR'] = '{}/jre'.format(destdir)
            env['DERBY_HOME'] = '{}/db'.format(destdir)
            if destdir not in env['PATH']:
                env['PATH'] = ':'.join([
                    '{}/bin'.format(env['JAVA_HOME']),
                    '{}/bin'.format(env['J2REDIR']),
                    '{}/bin'.format(env['DERBY_HOME']),
                    env['PATH'],
                ])
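# The comment above flags `extractall` as unsafe against path traversal.
# A guarded variant that rejects traversal members before extracting; a
# sketch, not the charm's actual code:
import os
import tarfile

def safe_extractall(tar_path, dest):
    with tarfile.open(tar_path, 'r') as tfile:
        dest_root = os.path.realpath(dest)
        for member in tfile.getmembers():
            target = os.path.realpath(os.path.join(dest, member.name))
            if not target.startswith(dest_root + os.sep):
                raise ValueError('blocked path traversal: {}'.format(member.name))
        tfile.extractall(dest)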
def setup_gobblin(self, host, port):
    '''
    Configure Gobblin.

    Each time something changes (e.g. a new Hadoop endpoint is present)
    this method must be called.

    :param str host: IP of the HDFS endpoint.
    :param str port: Port of the HDFS endpoint.
    '''
    # Set up the environment
    gobblin_bin = self.dist_config.path('gobblin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if gobblin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], gobblin_bin])
        env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
        env['GOBBLIN_WORK_DIR'] = "/user/gobblin/work"

    hdfs_endpoint = ''.join([host, ':', port])

    # Set up the Gobblin configuration
    conf_dir = self.dist_config.path('gobblin') / 'conf'
    gobblin_config_template = conf_dir / 'gobblin-mapreduce.properties.template'
    gobblin_config = conf_dir / 'gobblin-mapreduce.properties'
    copy(gobblin_config_template, gobblin_config)

    utils.re_edit_in_place(gobblin_config, {
        r'fs.uri=hdfs://localhost:8020': 'fs.uri=hdfs://%s' % hdfs_endpoint,
    })
def setup_zookeeper_config(self): """ Setup Zookeeper configuration based on default config. Copy the default configuration files to zookeeper_conf property defined in dist.yaml """ default_conf = self.dist_config.path('zookeeper') / 'conf' zookeeper_conf = self.dist_config.path('zookeeper_conf') zookeeper_conf.rmtree_p() default_conf.copytree(zookeeper_conf) # Now remove the conf included in the tarball and symlink our real conf default_conf.rmtree_p() zookeeper_conf.symlink(default_conf) zoo_cfg = zookeeper_conf / 'zoo.cfg' if not zoo_cfg.exists(): (zookeeper_conf / 'zoo_sample.cfg').copy(zoo_cfg) utils.re_edit_in_place(zoo_cfg, { r'^dataDir.*': 'dataDir={}'.format(self.dist_config.path('zookeeper_data_dir')), }) # Configure zookeeper environment for all users zookeeper_bin = self.dist_config.path('zookeeper') / 'bin' zookeeper_rest = self.dist_config.path('zookeeper') / 'src/contrib/rest' with utils.environment_edit_in_place('/etc/environment') as env: if zookeeper_bin not in env['PATH']: env['PATH'] = ':'.join([env['PATH'], zookeeper_bin]) env['ZOOCFGDIR'] = self.dist_config.path('zookeeper_conf') env['ZOO_BIN_DIR'] = zookeeper_bin env['ZOO_LOG_DIR'] = self.dist_config.path('zookeeper_log_dir') env['ZOO_REST'] = zookeeper_rest
def configure_flume(self, template_data=None):
    '''
    Handle configuration of Flume and set up the environment.
    '''
    render(
        source='flume.conf.j2',
        target=self.config_file,
        context=dict({
            'dist_config': self.dist_config,
        }, **(template_data or {})),
        filters={
            'agent_list': lambda agents, prefix='': ','.join([
                '%s%s' % (prefix, a['name']) for a in agents
            ]),
        },
    )

    flume_bin = self.dist_config.path('flume') / 'bin'
    java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
    java_home = re.sub('/bin/java', '', java_symlink).rstrip()
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flume_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flume_bin])
        env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
        env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
        env['FLUME_HOME'] = self.dist_config.path('flume')
        env['JAVA_HOME'] = java_home
def setup_hive_config(self):
    '''
    Copy the default configuration files to the hive_conf property
    defined in dist.yaml
    '''
    default_conf = self.dist_config.path('hive') / 'conf'
    hive_conf = self.dist_config.path('hive_conf')
    hive_conf.rmtree_p()
    default_conf.copytree(hive_conf)

    # Configure immutable bits
    hive_bin = self.dist_config.path('hive') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if hive_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hive_bin])
        env['HIVE_CONF_DIR'] = self.dist_config.path('hive_conf')

    hive_env = self.dist_config.path('hive_conf') / 'hive-env.sh'
    if not hive_env.exists():
        (self.dist_config.path('hive_conf') / 'hive-env.sh.template').copy(hive_env)

    hive_site = self.dist_config.path('hive_conf') / 'hive-site.xml'
    if not hive_site.exists():
        (self.dist_config.path('hive_conf') / 'hive-default.xml.template').copy(hive_site)
    with utils.xmlpropmap_edit_in_place(hive_site) as props:
        # TODO (kwm): we should be able to export java.io.tmpdir so these 4 aren't needed
        props['hive.exec.local.scratchdir'] = "/tmp/hive"
        props['hive.downloaded.resources.dir'] = "/tmp/hive_resources"
        props['hive.querylog.location'] = "/tmp/hive"
        props['hive.server2.logging.operation.log.location'] = "/tmp/hive"

    # create hdfs storage space
    utils.run_as('hive', 'hdfs', 'dfs', '-mkdir', '-p', '/user/hive/warehouse')
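# Hypothetical sketch of the `xmlpropmap_edit_in_place` helper used above:
# it exposes a Hadoop-style XML properties file
# (<configuration><property><name>..</name><value>..</value></property>...)
# as a dict and writes changes back on exit. The real helper ships with the
# charm utils; this reconstruction is an assumption.
import contextlib
import xml.etree.ElementTree as ET

@contextlib.contextmanager
def xmlpropmap_edit_in_place(path):
    tree = ET.parse(path)
    root = tree.getroot()
    props = {p.find('name').text: p.find('value').text
             for p in root.findall('property')}
    yield props  # callers mutate this dict in place
    # Rebuild the <property> elements from the (possibly modified) dict.
    for p in list(root.findall('property')):
        root.remove(p)
    for name, value in props.items():
        prop = ET.SubElement(root, 'property')
        ET.SubElement(prop, 'name').text = name
        ET.SubElement(prop, 'value').text = str(value)
    tree.write(path)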
def install(self):
    '''
    Fetch resources
    '''
    self.dist_config.add_users()
    self.dist_config.add_dirs()

    result = resource_get('tomee')
    if not result:
        log("Failed to fetch TomEE resource")
        return False

    unitdata.kv().set("tomeetarball", result)
    log("TomEE tarball path is {}".format(result))
    tomee_install_dir = self.dist_config.path('tomee_dir')
    with chdir(tomee_install_dir):
        utils.run_as('tomcat', 'tar', '-zxvf', '{}'.format(result))
        tomee_dirs = [f for f in os.listdir(tomee_install_dir)
                      if f.startswith('apache-tomee')]
        catalina_home = os.path.join(tomee_install_dir, tomee_dirs[0])
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['CATALINA_HOME'] = catalina_home
        unitdata.kv().set("catalina_home", catalina_home)

    self.open_ports()
    return True
def configure_hadoop(self):
    java_home = Path(unitdata.kv().get('java.home'))
    java_bin = java_home / 'bin'
    hadoop_bin = self.dist_config.path('hadoop') / 'bin'
    hadoop_sbin = self.dist_config.path('hadoop') / 'sbin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['JAVA_HOME'] = java_home
        if java_bin not in env['PATH']:
            env['PATH'] = ':'.join([java_bin, env['PATH']])  # ensure that correct java is used
        if hadoop_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hadoop_bin])
        if hadoop_sbin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hadoop_sbin])
        env['HADOOP_LIBEXEC_DIR'] = self.dist_config.path('hadoop') / 'libexec'
        env['HADOOP_INSTALL'] = self.dist_config.path('hadoop')
        env['HADOOP_HOME'] = self.dist_config.path('hadoop')
        env['HADOOP_COMMON_HOME'] = self.dist_config.path('hadoop')
        env['HADOOP_HDFS_HOME'] = self.dist_config.path('hadoop')
        env['HADOOP_MAPRED_HOME'] = self.dist_config.path('hadoop')
        env['HADOOP_YARN_HOME'] = self.dist_config.path('hadoop')
        env['YARN_HOME'] = self.dist_config.path('hadoop')
        env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
        env['YARN_CONF_DIR'] = self.dist_config.path('hadoop_conf')
        env['YARN_LOG_DIR'] = self.dist_config.path('yarn_log_dir')
        env['HDFS_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')
        env['HADOOP_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')  # for hadoop 2.2.0 only
        env['MAPRED_LOG_DIR'] = '/var/log/hadoop/mapred'  # should be moved to config, but could
        env['MAPRED_PID_DIR'] = '/var/run/hadoop/mapred'  # be destructive for mapreduce operation

    hadoop_env = self.dist_config.path('hadoop_conf') / 'hadoop-env.sh'
    utils.re_edit_in_place(hadoop_env, {
        r'export JAVA_HOME *=.*': 'export JAVA_HOME=%s' % java_home,
    })
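# Several functions below read these variables back with
# `utils.read_etc_env`. A minimal sketch of such a reader (the real helper
# is part of the charm utils; this reconstruction is an assumption):
import re

def read_etc_env(path='/etc/environment'):
    env = {}
    with open(path) as f:
        for line in f:
            m = re.match(r'^\s*([A-Za-z_][A-Za-z0-9_]*)="?(.*?)"?\s*$', line)
            if m:
                env[m.group(1)] = m.group(2)
    return env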
def configure(self):
    '''
    Configure spark environment for all users
    '''
    spark_home = self.dist_config.path('spark')
    spark_bin = spark_home / 'bin'

    # put our jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(spark_home))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        print("File exists")

    # update environment variables
    with utils.environment_edit_in_place('/etc/environment') as env:
        if spark_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], spark_bin])
        env['MASTER'] = hookenv.config('spark_execution_mode')
        env['PYSPARK_DRIVER_PYTHON'] = "ipython"
        env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
        env['SPARK_HOME'] = spark_home
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # update spark config
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.eventLog.enabled *.*': 'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir *.*': 'spark.eventLog.dir hdfs:///user/ubuntu/directory',
    })
def configure_flume(self, template_data=None):
    '''
    Handle configuration of Flume and set up the environment.
    '''
    render(
        source='flume.conf.j2',
        target=self.config_file,
        context=dict({
            'dist_config': self.dist_config,
        }, **(template_data or {})),
        filters={
            'agent_list': lambda agents, prefix='': ','.join(
                ['%s%s' % (prefix, a['name']) for a in agents]),
        },
    )

    flume_bin = self.dist_config.path('flume') / 'bin'
    java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
    java_home = re.sub('/bin/java', '', java_symlink).rstrip()
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flume_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flume_bin])
        env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
        env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
        env['FLUME_HOME'] = self.dist_config.path('flume')
        env['JAVA_HOME'] = java_home
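# The Flume variants above shell out to `readlink -f` to derive JAVA_HOME
# from the /usr/bin/java symlink. A pure-Python sketch of the same
# derivation (equivalent to stripping the trailing '/bin/java'):
import os

def java_home_from_symlink(java_path='/usr/bin/java'):
    # /usr/bin/java -> .../bin/java, so JAVA_HOME is two levels up.
    real_java = os.path.realpath(java_path)
    return os.path.dirname(os.path.dirname(real_java))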
def update_config(self, mode):
    """
    Configure Pig with the correct classpath. If Hadoop is available,
    use HADOOP_CONF_DIR, otherwise use PIG_HOME.
    """
    with utils.environment_edit_in_place('/etc/environment') as env:
        key = 'HADOOP_CONF_DIR' if mode == 'mapreduce' else 'PIG_HOME'
        env['PIG_CLASSPATH'] = env[key]
def setup_hue(self, namenodes, resourcemanagers, hdfs_port, yarn_port,
              yarn_http, yarn_ipc):
    hookenv.status_set('maintenance', 'Setting up Hue')
    hue_bin = self.dist_config.path('hue') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if hue_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hue_bin])
        env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
        env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
        hadoop_conf = env['HADOOP_CONF_DIR'] + '/core-site.xml'
        yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'
        mapred_conf = env['HADOOP_CONF_DIR'] + '/mapred-site.xml'

    with utils.xmlpropmap_edit_in_place(hadoop_conf) as props:
        hdfs_endpoint = props['fs.defaultFS']

    with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
        yarn_log_url = props['yarn.log.server.url']  # 19888
        yarn_resmgr = props['yarn.resourcemanager.address']  # 8032

    with utils.xmlpropmap_edit_in_place(mapred_conf) as props:
        mapred_jobhistory = props['mapreduce.jobhistory.address']  # 10020

    default_conf = self.dist_config.path('hue') / 'desktop/conf'
    hue_conf = self.dist_config.path('hue_conf')

    if os.path.islink('/usr/lib/hue/desktop/conf'):
        return
    else:
        hue_conf.rmtree_p()
        default_conf.copytree(hue_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        hue_conf.symlink(default_conf)

    hdfs_fulluri = hdfs_endpoint.split('/')[2]
    hdfs_hostname = hdfs_fulluri.split(':')[0]

    hue_config = ''.join((self.dist_config.path('hue'), '/desktop/conf/hue.ini'))
    hue_port = self.dist_config.port('hue_web')

    # Fix following for HA: http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.3.0/bk_hadoop-ha/content/ha-nn-deploy-hue.html
    hookenv.log("Not currently supporting HA, FIX: namenodes are: " +
                str(namenodes) + " resmanagers: " + str(resourcemanagers))
    utils.re_edit_in_place(hue_config, {
        r'http_port=8888': 'http_port=%s' % hue_port,
        # r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s' % hdfs_endpoint,
        r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s:%s' % (namenodes[0], hdfs_port),
        # r'## resourcemanager_host=localhost': 'resourcemanager_host=%s' % yarn_resmgr.split(':')[0],
        r'.*resourcemanager_host=localhost': 'resourcemanager_host=%s' % resourcemanagers[0],
        # r'## resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_resmgr.split(':')[1],
        r'.*resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_port,
        r'.*webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://%s:50070/webhdfs/v1' % namenodes[0],
        r'.*history_server_api_url=http://localhost:19888': 'history_server_api_url=%s' % yarn_log_url.split('/')[0],
        r'.*resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://%s:8088' % yarn_resmgr.split(':')[0],
        r'.*secret_key=.*': 'secret_key=%s' % uuid.uuid4()
    })

    self.update_apps()
def configure_zeppelin(self):
    '''
    Configure zeppelin environment for all users
    '''
    zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zeppelin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
        env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

    zeppelin_site = self.dist_config.path('zeppelin_conf') / 'zeppelin-site.xml'
    with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
        xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
        xml['zeppelin.notebook.dir'] = self.dist_config.path('zeppelin_notebooks')

    etc_env = utils.read_etc_env()
    hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
    spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_HOME={}\n'.format(self.dist_config.path('zeppelin')))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export ZEPPELIN_LOG_DIR={}\n'.format(self.dist_config.path('zeppelin_logs')))
        f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
        f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(self.dist_config.path('zeppelin_notebooks')))
        f.write('export SPARK_HOME={}\n'.format(spark_home))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
        f.write('export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'.format(s=spark_home))
        f.write('export MASTER={}\n'.format(spark_exe_mode))

    # User needs write access to zepp's conf to write interpreter.json
    # on server start. chown the whole conf dir, though we could probably
    # touch that file and chown it, leaving the rest owned as root:root.
    # TODO: weigh implications of having zepp's conf dir owned by non-root.
    cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path('zeppelin_conf'))
    call(cmd.split())
def setup_etc_env(self):
    '''
    Write some niceties to /etc/environment
    '''
    # Configure system-wide bits
    zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
    zeppelin_conf = self.dist_config.path('zeppelin_conf')
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zeppelin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
        env['ZEPPELIN_CONF_DIR'] = zeppelin_conf
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    dc = self.dist_config
    prefix = dc.path('log_prefix')
    events_dir = dc.path('spark_events')
    events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop', events_dir)

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # ensure user-provided Hadoop works
    hadoop_classpath = utils.run_as('hdfs', 'hadoop', 'classpath',
                                    capture_output=True)
    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DIST_CLASSPATH.*': 'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
    }, append_non_matches=True)

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
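# Hypothetical sketch of the `run_as` helper used throughout: run a command
# as another user, optionally capturing its output. The real helper ships
# with the charm utils (and may use `su` rather than `sudo`); this
# reconstruction is an assumption.
from subprocess import check_call, check_output

def run_as(user, command, *args, **kwargs):
    cmd = ['sudo', '-u', user, command] + list(args)
    if kwargs.get('capture_output'):
        return check_output(cmd).decode('utf8').strip()
    check_call(cmd)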
def initial_pig_config(self):
    '''
    Configure system-wide pig bits.
    '''
    pig_bin = self.dist_config.path('pig') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if pig_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], pig_bin])
        env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
        env['PIG_HOME'] = self.dist_config.path('pig')
        env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
def configure_flume(self):
    flume_bin = self.dist_config.path('flume') / 'bin'
    # readlink returns bytes; decode before handing the path to re.sub
    java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
    java_home = re.sub('/bin/java', '', java_symlink).rstrip()
    java_cp = "{}".format(self.dist_config.path('flume') / 'lib')
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flume_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flume_bin])
        env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
        env['FLUME_CLASSPATH'] = java_cp
        env['FLUME_HOME'] = self.dist_config.path('flume')
        env['JAVA_HOME'] = java_home
def set_oozie_env(self):
    oozie_bin = self.dist_config.path('oozie') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if oozie_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], oozie_bin])
        env['OOZIE_HOME'] = self.dist_config.path('oozie') / 'libexec'
        env['OOZIE_CONFIG'] = self.dist_config.path('oozie_conf')
        env['OOZIE_DATA'] = self.dist_config.path('oozie_data')
        env['OOZIE_LOG'] = self.dist_config.path('oozie_log')
        env['CATALINA_BASE'] = self.dist_config.path('oozie') / 'oozie-server'
        env['CATALINA_TMPDIR'] = '/tmp'
        env['CATALINA_PID'] = '/tmp/oozie.pid'
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'mahout-client',
    ], )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    hookenv.status_set('active', 'ready')
    set_state('mahout.installed')
def setup_kafka_config(self):
    '''
    copy the default configuration files to kafka_conf property
    defined in dist.yaml
    '''
    default_conf = self.dist_config.path('kafka') / 'config'
    kafka_conf = self.dist_config.path('kafka_conf')
    kafka_conf.rmtree_p()
    default_conf.copytree(kafka_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    # dir. we've seen issues where kafka still looks for config in
    # KAFKA_HOME/config.
    default_conf.rmtree_p()
    kafka_conf.symlink(default_conf)

    # Configure immutable bits
    kafka_bin = self.dist_config.path('kafka') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if kafka_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], kafka_bin])
        env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

    # note: we set the advertised.host.name below to the public_address
    # to ensure that external (non-Juju) clients can connect to Kafka
    public_address = hookenv.unit_get('public-address')
    private_ip = utils.resolve_private_address(
        hookenv.unit_get('private-address'))
    kafka_server_conf = self.dist_config.path('kafka_conf') / 'server.properties'
    service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
    utils.re_edit_in_place(kafka_server_conf, {
        r'^broker.id=.*': 'broker.id=%s' % unit_num,
        r'^port=.*': 'port=%s' % self.dist_config.port('kafka'),
        r'^log.dirs=.*': 'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
        r'^#?advertised.host.name=.*': 'advertised.host.name=%s' % public_address,
    })

    kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
    utils.re_edit_in_place(kafka_log4j, {
        r'^kafka.logs.dir=.*': 'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
    })

    # fix for lxc containers and some corner cases in manual provider
    # ensure that public_address is resolvable internally by mapping it to the private IP
    utils.update_kv_host(private_ip, public_address)
    utils.manage_etc_hosts()
def setup_kafka_config(self):
    '''
    copy the default configuration files to kafka_conf property
    defined in dist.yaml
    '''
    default_conf = self.dist_config.path('kafka') / 'config'
    kafka_conf = self.dist_config.path('kafka_conf')
    kafka_conf.rmtree_p()
    default_conf.copytree(kafka_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    # dir. we've seen issues where kafka still looks for config in
    # KAFKA_HOME/config.
    default_conf.rmtree_p()
    kafka_conf.symlink(default_conf)

    # Configure immutable bits
    kafka_bin = self.dist_config.path('kafka') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if kafka_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], kafka_bin])
        env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

    # note: we set the advertised.host.name below to the public_address
    # to ensure that external (non-Juju) clients can connect to Kafka
    public_address = hookenv.unit_get('public-address')
    private_ip = utils.resolve_private_address(hookenv.unit_get('private-address'))
    kafka_server_conf = self.dist_config.path('kafka_conf') / 'server.properties'
    service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
    utils.re_edit_in_place(kafka_server_conf, {
        r'^broker.id=.*': 'broker.id=%s' % unit_num,
        r'^port=.*': 'port=%s' % self.dist_config.port('kafka'),
        r'^log.dirs=.*': 'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
        r'^#?advertised.host.name=.*': 'advertised.host.name=%s' % public_address,
    })

    kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
    utils.re_edit_in_place(kafka_log4j, {
        r'^kafka.logs.dir=.*': 'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
    })

    # fix for lxc containers and some corner cases in manual provider
    # ensure that public_address is resolvable internally by mapping it to the private IP
    utils.update_etc_hosts({private_ip: public_address})

    templating.render(
        'upstart.conf',
        '/etc/init/kafka.conf',
        context={
            'kafka_conf': self.dist_config.path('kafka_conf'),
            'kafka_bin': '{}/bin'.format(self.dist_config.path('kafka'))
        },
    )
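# Hypothetical sketch of the `update_etc_hosts` helper used above: map each
# hostname to an IP in /etc/hosts so the advertised name resolves
# internally. The real helper is part of the charm utils; this
# reconstruction is an assumption and ignores comments and IPv6 niceties.
def update_etc_hosts(mapping, path='/etc/hosts'):
    with open(path) as f:
        # Drop any existing entries for the hostnames we are about to map.
        lines = [l for l in f.read().splitlines()
                 if not any(name in l.split() for name in mapping.values())]
    for ip, name in mapping.items():
        lines.append('{} {}'.format(ip, name))
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')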
def configure_notebook(self):
    # profile config created during install
    ipython_profile = "ipython_notebook_config.py"
    # find path to ipython_notebook_config.py
    pPath = "/home/ubuntu/.ipython/profile_pyspark"
    cmd = ['find', pPath, '-name', ipython_profile]
    profile_config = check_output(cmd, universal_newlines=True).strip()

    # update profile with standard opts and configured port
    port = self.dist_config.port('notebook')
    notebooks_dir = self.dist_config.path('notebooks')
    utils.re_edit_in_place(profile_config, {
        r'.*c.NotebookApp.ip *=.*': 'c.NotebookApp.ip = "*"',
        r'.*c.NotebookApp.open_browser *=.*': 'c.NotebookApp.open_browser = False',
        r'.*c.NotebookApp.port *=.*': 'c.NotebookApp.port = {}'.format(port),
        r'.*c.NotebookManager.notebook_dir *=.*': "c.NotebookManager.notebook_dir = u'{}'".format(notebooks_dir),
    })

    spark_home = os.environ.get("SPARK_HOME", '/usr/lib/spark')
    py4j = "py4j-0.*.zip"
    cmd = "find {} -name {}".format(spark_home, py4j)
    # TODO: handle missing py4j
    py4j_path = check_output(cmd.split(), universal_newlines=True).strip()

    setup_source = 'scripts/00-pyspark-setup.py'
    Path(setup_source).chmod(0o755)
    Path(setup_source).chown('ubuntu', 'hadoop')
    utils.re_edit_in_place(setup_source, {
        r'py4j *=.*': 'py4j="{}"'.format(py4j_path),
    })
    home = Path(os.environ.get('HOME', '/home/ubuntu'))
    profile_dir = home / '.ipython/profile_pyspark'
    setup_target = profile_dir / 'startup/00-pyspark-setup.py'
    Path(setup_source).copy2(setup_target)

    # Our spark charm defaults to yarn-client, so that should
    # be a safe default here in case MASTER isn't set. Update the env
    # with our spark mode and py4j location.
    spark_mode = os.environ.get("MASTER", "yarn-client")
    spark_home = Path(os.environ.get("SPARK_HOME", "/usr/lib/spark"))
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['PYSPARK_DRIVER_PYTHON_OPTS'] = "notebook"
        env['PYSPARK_SUBMIT_ARGS'] = "--master " + spark_mode
        env['PYTHONPATH'] = spark_home / py4j_path
def disable_yarn_mode(self):
    # point SPARK_JAR back at the local assembly jar
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', False)
    unitdata.kv().flush(True)
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'mahout-client',
        ],
    )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    hookenv.status_set('active', 'ready')
    set_state('mahout.installed')
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    dc = self.dist_config
    prefix = dc.path('log_prefix')
    events_dir = dc.path('spark_events')
    events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop', events_dir)

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # ensure user-provided Hadoop works
    hadoop_classpath = utils.run_as('hdfs', 'hadoop', 'classpath',
                                    capture_output=True)
    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DIST_CLASSPATH.*': 'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
    }, append_non_matches=True)

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.driver.extraClassPath .*': 'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
def superset_startup():
    superset_dir = '/home/ubuntu/superset'
    db_uri = 'sqlite:////home/ubuntu/.superset/superset.db'
    context = {'db_uri': db_uri}
    host.mkdir(superset_dir)
    templating.render(source='superset_config.py.jinja2',
                      target=superset_dir + '/superset_config.py',
                      context=context)
    with utils.environment_edit_in_place('/etc/environment') as env:
        # Append superset_config.py to the PYTHONPATH
        env['PYTHONPATH'] = "$PYTHONPATH:%s" % (superset_dir + '/superset_config.py')

    # Create an admin user (you will be prompted to set username, first and
    # last name before setting a password)
    #   Username [admin]:
    #   User first name [admin]:
    #   User last name [user]:
    #   Email [[email protected]]:
    #   Password:
    #   Repeat for confirmation:
    hookenv.log('Creating admin user for Superset')
    child = pexpect.spawn("su - ubuntu -c \"fabmanager create-admin --app superset\"")
    child.expect('\\r\\nUsername \[admin\]: ')
    child.sendline()
    child.expect('\\r\\nUser first name \[admin\]: ')
    child.sendline()
    child.expect('\\r\\nUser last name \[user\]: ')
    child.sendline()
    child.expect('\\r\\nEmail \[[email protected]\]: ')
    child.sendline()
    child.expect('\\r\\nPassword: ')
    child.sendline('admin')
    child.expect('\\r\\nRepeat for confirmation: ')
    child.sendline('admin')

    # Create default roles and permissions
    hookenv.log('Create default roles and permissions')
    # subprocess.check_call(['superset', 'init'])
    subprocess.check_call(['su', '-', 'ubuntu', '-c', 'superset init'])

    # Start the web server on port 8088, use -p to bind to another port
    hookenv.log('Start the web server on port 8088')
    # subprocess.Popen(['superset', 'runserver'])
    subprocess.Popen(['su', '-', 'ubuntu', '-c', 'superset runserver'])

    set_state('superset.ready')
    status_set('active', 'Superset up and running')
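# The interactive dialogue above could likely be avoided by answering the
# prompts via flags. A sketch, assuming fabmanager's create-admin accepts
# these options (verify against the installed flask-appbuilder version);
# the email and password here are placeholders, not values from the charm:
import subprocess

def create_superset_admin():
    subprocess.check_call([
        'su', '-', 'ubuntu', '-c',
        'fabmanager create-admin --app superset'
        ' --username admin --firstname admin --lastname user'
        ' --email admin@example.com --password admin'
    ])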
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'mahout-client',
    ], )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    set_state('mahout.installed')
    hookenv.status_set('active', 'ready')

    # set app version string for juju status output
    mahout_version = get_package_version('mahout') or 'unknown'
    hookenv.application_version_set(mahout_version)
def configure_zeppelin(self): """ Configure zeppelin environment for all users """ zeppelin_bin = self.dist_config.path("zeppelin") / "bin" with utils.environment_edit_in_place("/etc/environment") as env: if zeppelin_bin not in env["PATH"]: env["PATH"] = ":".join([env["PATH"], zeppelin_bin]) env["ZEPPELIN_CONF_DIR"] = self.dist_config.path("zeppelin_conf") zeppelin_site = self.dist_config.path("zeppelin_conf") / "zeppelin-site.xml" with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml: xml["zeppelin.server.port"] = self.dist_config.port("zeppelin") xml["zeppelin.notebook.dir"] = self.dist_config.path("zeppelin_notebooks") etc_env = utils.read_etc_env() hadoop_conf_dir = etc_env.get("HADOOP_CONF_DIR", "/etc/hadoop/conf") spark_home = etc_env.get("SPARK_HOME", "/usr/lib/spark") spark_driver_mem = etc_env.get("SPARK_DRIVER_MEMORY", "1g") spark_exe_mode = os.environ.get("MASTER", "yarn-client") spark_executor_mem = etc_env.get("SPARK_EXECUTOR_MEMORY", "1g") zeppelin_env = self.dist_config.path("zeppelin_conf") / "zeppelin-env.sh" with open(zeppelin_env, "a") as f: f.write("export ZEPPELIN_HOME={}\n".format(self.dist_config.path("zeppelin"))) f.write( 'export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format( spark_driver_mem, spark_executor_mem ) ) f.write("export ZEPPELIN_LOG_DIR={}\n".format(self.dist_config.path("zeppelin_logs"))) f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n') f.write("export ZEPPELIN_NOTEBOOK_DIR={}\n".format(self.dist_config.path("zeppelin_notebooks"))) f.write("export SPARK_HOME={}\n".format(spark_home)) f.write( 'export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format( spark_driver_mem, spark_executor_mem ) ) f.write("export HADOOP_CONF_DIR={}\n".format(hadoop_conf_dir)) f.write("export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n".format(s=spark_home)) f.write("export MASTER={}\n".format(spark_exe_mode)) # User needs write access to zepp's conf to write interpreter.json # on server start. chown the whole conf dir, though we could probably # touch that file and chown it, leaving the rest owned as root:root. # TODO: weigh implications of have zepp's conf dir owned by non-root. cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path("zeppelin_conf")) call(cmd.split())
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 self.dist_config.path('spark_events'))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 self.dist_config.path('spark_events'))

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled .*': 'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir .*': 'spark.eventLog.dir hdfs://{}'.format(
            self.dist_config.path('spark_events')),
        r'.*spark.driver.extraClassPath .*': 'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
def disable_yarn_mode(self):
    # point SPARK_JAR back at the local assembly jar
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', False)
    unitdata.kv().flush(True)
def disable_yarn_mode(self):
    # point SPARK_JAR back at the local assembly jar
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled .*': 'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir .*': '# spark.eventLog.dir hdfs:///user/ubuntu/directory',
        r'.*spark.driver.extraClassPath .*': '# spark.driver.extraClassPath none',
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', False)
    unitdata.kv().flush(True)
def configure_zeppelin(self):
    '''
    Configure zeppelin environment for all users
    '''
    zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zeppelin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
        env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

    zeppelin_site = self.dist_config.path('zeppelin_conf') / 'zeppelin-site.xml'
    with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
        xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
        xml['zeppelin.notebook.dir'] = self.dist_config.path('zeppelin_notebooks')

    etc_env = utils.read_etc_env()
    hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_CLASSPATH_OVERRIDES={}\n'.format(hadoop_extra_classpath))
        f.write('export ZEPPELIN_HOME={}\n'.format(self.dist_config.path('zeppelin')))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export ZEPPELIN_LOG_DIR={}\n'.format(self.dist_config.path('zeppelin_logs')))
        f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
        f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(self.dist_config.path('zeppelin_notebooks')))
        f.write('export SPARK_HOME={}\n'.format(spark_home))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
        f.write('export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'.format(s=spark_home))
        f.write('export MASTER={}\n'.format(spark_exe_mode))

    # User needs write access to zepp's conf to write interpreter.json
    # on server start. chown the whole conf dir, though we could probably
    # touch that file and chown it, leaving the rest owned as root:root.
    # TODO: weigh implications of having zepp's conf dir owned by non-root.
    cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path('zeppelin_conf'))
    call(cmd.split())
def configure_flume_env(self, flume_hdfs_info_dict):
    config = hookenv.config()
    templating.render(
        source='flume.conf.j2',
        target=self.dist_config.path('flume_conf') / 'flume.conf',
        context={'dist_config': self.dist_config,
                 'config': config,
                 'flume_hdfs': flume_hdfs_info_dict})

    flume_bin = self.dist_config.path('flume') / 'bin'
    java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
    java_home = re.sub('/bin/java', '', java_symlink).rstrip()
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flume_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flume_bin])
        env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
        env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
        env['FLUME_HOME'] = self.dist_config.path('flume')
        env['JAVA_HOME'] = java_home
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'mahout-client',
        ],
    )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    set_state('mahout.installed')
    hookenv.status_set('active', 'ready')

    # set app version string for juju status output
    mahout_version = get_package_version('mahout') or 'unknown'
    hookenv.application_version_set(mahout_version)
def install_giraph(giraph):
    """Install giraph when prerequisite states are present."""
    hookenv.status_set('maintenance', 'installing giraph')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'giraph-client',
        ],
    )
    bigtop.trigger_puppet()

    # Put down the -doc subpackage so we get giraph-examples
    fetch.apt_install('giraph-doc')

    giraph_home = Path('/usr/lib/giraph')
    giraph_docdir = Path('/usr/share/doc/giraph')
    giraph_libdir = Path(giraph_home / 'lib')
    giraph_examples = glob('{}/giraph-examples-*.jar'.format(giraph_docdir))

    # Gather a list of all the giraph jars (needed for -libjars)
    giraph_jars = giraph_examples
    giraph_jars.extend(get_good_jars(giraph_home, prefix=True))
    giraph_jars.extend(get_good_jars(giraph_libdir, prefix=True))

    # Update environment with appropriate giraph bits. HADOOP_CLASSPATH can
    # use wildcards (and it should for readability), but GIRAPH_JARS, which
    # is intended to be used as 'hadoop jar -libjars $GIRAPH_JARS', needs to
    # be a comma-separated list of jars.
    with utils.environment_edit_in_place('/etc/environment') as env:
        cur_cp = env['HADOOP_CLASSPATH'] if 'HADOOP_CLASSPATH' in env else ""
        env['GIRAPH_HOME'] = giraph_home
        env['HADOOP_CLASSPATH'] = "{examples}/*:{home}/*:{libs}/*:{cp}".format(
            examples=giraph_docdir,
            home=giraph_home,
            libs=giraph_libdir,
            cp=cur_cp
        )
        env['GIRAPH_JARS'] = ','.join(j for j in giraph_jars)

    set_state('giraph.installed')
    report_status()
    # set app version string for juju status output
    giraph_version = get_package_version('giraph') or 'unknown'
    hookenv.application_version_set(giraph_version)
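# Sketch of how the exported variables are meant to be consumed when
# submitting a Giraph job, per the 'hadoop jar -libjars $GIRAPH_JARS'
# comment above. The jar path and computation class are illustrative
# assumptions, not values set by the charm:
import os
from subprocess import check_call

def run_giraph_example():
    env = dict(os.environ)
    check_call([
        'hadoop', 'jar',
        '/usr/share/doc/giraph/giraph-examples.jar',  # hypothetical jar path
        'org.apache.giraph.GiraphRunner',
        '-libjars', env['GIRAPH_JARS'],               # comma-separated list
        'org.apache.giraph.examples.SimpleShortestPathsComputation',
    ], env=env)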
def configure_flume(self):
    config = hookenv.config()
    templating.render(
        source='flume.conf.j2',
        target=self.dist_config.path('flume_conf') / 'flume.conf',
        context={'dist_config': self.dist_config, 'config': config})

    flume_bin = self.dist_config.path('flume') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flume_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flume_bin])
        env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
        env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
        env['FLUME_HOME'] = self.dist_config.path('flume')

    # flume_env = self.dist_config.path('flume_conf') / 'flume-env.sh'
    # utils.re_edit_in_place(flume_env, {
    # })

    utils.run_as('flume', 'hdfs', 'dfs', '-mkdir', '-p', '/user/flume')
def setup_hue(self):
    hue_bin = self.dist_config.path('hue') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if hue_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hue_bin])
        env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
        env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
        hadoop_conf = env['HADOOP_CONF_DIR'] + '/core-site.xml'
        yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'
        mapred_conf = env['HADOOP_CONF_DIR'] + '/mapred-site.xml'

    with utils.xmlpropmap_edit_in_place(hadoop_conf) as props:
        hdfs_endpoint = props['fs.defaultFS']

    with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
        yarn_log_url = props['yarn.log.server.url']  # 19888
        yarn_resmgr = props['yarn.resourcemanager.address']  # 8032

    with utils.xmlpropmap_edit_in_place(mapred_conf) as props:
        mapred_jobhistory = props['mapreduce.jobhistory.address']  # 10020

    default_conf = self.dist_config.path('hue') / 'desktop/conf'
    hue_conf = self.dist_config.path('hue_conf')
    hue_conf.rmtree_p()
    default_conf.copytree(hue_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    default_conf.rmtree_p()
    hue_conf.symlink(default_conf)

    hdfs_fulluri = hdfs_endpoint.split('/')[2]
    hdfs_hostname = hdfs_fulluri.split(':')[0]

    hue_config = ''.join((self.dist_config.path('hue'), '/desktop/conf/hue.ini'))
    hue_port = self.dist_config.port('hue_web')
    utils.re_edit_in_place(hue_config, {
        r'http_port=8888': 'http_port=%s' % hue_port,
        # the replacement key must be 'fs_defaultfs' (was mistyped 'fs_defaults')
        r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s' % hdfs_endpoint,
        r'## resourcemanager_host=localhost': 'resourcemanager_host=%s' % yarn_resmgr.split(':')[0],
        r'## resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_resmgr.split(':')[1],
        r'## webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://%s:50070/webhdfs/v1' % hdfs_hostname,
        r'## history_server_api_url=http://localhost:19888': 'history_server_api_url=%s' % yarn_log_url.split('/')[0],
        r'## resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://%s:8088' % yarn_resmgr.split(':')[0]
    })
def setup_hue(self, namenodes, resourcemanagers, hdfs_port, yarn_port):
    hookenv.status_set('maintenance', 'Setting up Hue')
    hue_bin = self.dist_config.path('hue') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if hue_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hue_bin])
        env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
        env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
        yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'

    with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
        yarn_log_url = props['yarn.log.server.url']  # 19888
        yarn_resmgr = props['yarn.resourcemanager.address']  # 8032

    default_conf = self.dist_config.path('hue') / 'desktop/conf'
    hue_conf = self.dist_config.path('hue_conf')

    if os.path.islink('/usr/lib/hue/desktop/conf'):
        return
    else:
        hue_conf.rmtree_p()
        default_conf.copytree(hue_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        hue_conf.symlink(default_conf)

    hue_port = self.dist_config.port('hue_web')

    # Fix following for HA: http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.3.0/bk_hadoop-ha/content/ha-nn-deploy-hue.html
    hookenv.log("Not currently supporting HA, FIX: namenodes are: " +
                str(namenodes) + " resmanagers: " + str(resourcemanagers))
    utils.re_edit_in_place(self.hue_config, {
        # note: this replacement used '%' on a '{}' template, which raises
        # TypeError; it must use .format()
        r'http_port=8888': 'http_port={}'.format(hue_port),
        r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs={}:{}'.format(namenodes[0], hdfs_port),
        r'.*resourcemanager_host=localhost': 'resourcemanager_host={}'.format(resourcemanagers[0]),
        r'.*resourcemanager_port=8032': 'resourcemanager_port={}'.format(yarn_port),
        r'.*webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://{}:50070/webhdfs/v1'.format(namenodes[0]),
        r'.*history_server_api_url=http://localhost:19888': 'history_server_api_url={}'.format(yarn_log_url.split('/')[0]),
        r'.*resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://{}:8088'.format(yarn_resmgr.split(':')[0]),
        r'.*secret_key=.*': 'secret_key={}'.format(uuid.uuid4())
    })

    self.update_apps()
def configure_livy(self):
    """
    Configure livy environment for all users
    """
    livy_bin = self.dist_config.path("livy") / "bin"
    with utils.environment_edit_in_place("/etc/environment") as env:
        if livy_bin not in env["PATH"]:
            env["PATH"] = ":".join([env["PATH"], livy_bin])
        hadoop_cp = "/etc/hadoop/conf:/usr/lib/hadoop/share/hadoop/common/lib/*:/usr/lib/hadoop/share/hadoop/common/*\
:/usr/lib/hadoop/share/hadoop/hdfs:/usr/lib/hadoop/share/hadoop/hdfs/lib/*\
:/usr/lib/hadoop/share/hadoop/hdfs/*:/usr/lib/hadoop/share/hadoop/yarn/lib/*\
:/usr/lib/hadoop/share/hadoop/yarn/*:/usr/lib/hadoop/share/hadoop/mapreduce/lib/*\
:/usr/lib/hadoop/share/hadoop/mapreduce/*:/usr/lib/hadoop/contrib/capacity-scheduler/*.jar"
        env["CLASSPATH"] = hadoop_cp

    cmd = "chown -R hue:hadoop {}".format(self.dist_config.path("livy"))
    call(cmd.split())
    cmd = "chown -R hue:hadoop {}".format(self.dist_config.path("livy_conf"))
    call(cmd.split())
def configure_hadoop(self):
    java_home = Path(unitdata.kv().get('java.home'))
    java_bin = java_home / 'bin'
    hadoop_home = self.dist_config.path('hadoop')
    hadoop_bin = hadoop_home / 'bin'
    hadoop_sbin = hadoop_home / 'sbin'

    # If we have hadoop-addons (like lzo), set those in the environment
    hadoop_extra_classpath = []
    if 'lzo' in self.resources:
        hadoop_extra_classpath.extend(hadoop_home.walkfiles('hadoop-lzo-*.jar'))

    with utils.environment_edit_in_place('/etc/environment') as env:
        env['JAVA_HOME'] = java_home
        if java_bin not in env['PATH']:
            env['PATH'] = ':'.join([java_bin, env['PATH']])  # ensure that correct java is used
        if hadoop_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hadoop_bin])
        if hadoop_sbin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], hadoop_sbin])
        if hadoop_extra_classpath:
            env['HADOOP_EXTRA_CLASSPATH'] = ':'.join(hadoop_extra_classpath)
        env['HADOOP_LIBEXEC_DIR'] = hadoop_home / 'libexec'
        env['HADOOP_INSTALL'] = hadoop_home
        env['HADOOP_HOME'] = hadoop_home
        env['HADOOP_COMMON_HOME'] = hadoop_home
        env['HADOOP_HDFS_HOME'] = hadoop_home
        env['HADOOP_MAPRED_HOME'] = hadoop_home
        env['HADOOP_MAPRED_LOG_DIR'] = self.dist_config.path('mapred_log_dir')
        env['HADOOP_YARN_HOME'] = hadoop_home
        env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
        env['YARN_LOG_DIR'] = self.dist_config.path('yarn_log_dir')
        env['HADOOP_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')

    hadoop_env = self.dist_config.path('hadoop_conf') / 'hadoop-env.sh'
    utils.re_edit_in_place(hadoop_env, {
        r'export JAVA_HOME *=.*': 'export JAVA_HOME=%s' % java_home,
    })
def install_giraph(giraph):
    """Install giraph when prerequisite states are present."""
    hookenv.status_set('maintenance', 'installing giraph')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'giraph-client',
    ], )
    bigtop.trigger_puppet()

    # Put down the -doc subpackage so we get giraph-examples
    fetch.apt_install('giraph-doc')

    giraph_home = Path('/usr/lib/giraph')
    giraph_docdir = Path('/usr/share/doc/giraph')
    giraph_libdir = Path(giraph_home / 'lib')
    giraph_examples = glob('{}/giraph-examples-*.jar'.format(giraph_docdir))

    # Gather a list of all the giraph jars (needed for -libjars)
    giraph_jars = giraph_examples
    giraph_jars.extend(get_good_jars(giraph_home, prefix=True))
    giraph_jars.extend(get_good_jars(giraph_libdir, prefix=True))

    # Update environment with appropriate giraph bits. HADOOP_CLASSPATH can
    # use wildcards (and it should for readability), but GIRAPH_JARS, which
    # is intended to be used as 'hadoop jar -libjars $GIRAPH_JARS', needs to
    # be a comma-separated list of jars.
    with utils.environment_edit_in_place('/etc/environment') as env:
        cur_cp = env['HADOOP_CLASSPATH'] if 'HADOOP_CLASSPATH' in env else ""
        env['GIRAPH_HOME'] = giraph_home
        env['HADOOP_CLASSPATH'] = "{examples}/*:{home}/*:{libs}/*:{cp}".format(
            examples=giraph_docdir, home=giraph_home, libs=giraph_libdir, cp=cur_cp)
        env['GIRAPH_JARS'] = ','.join(j for j in giraph_jars)

    set_state('giraph.installed')
    report_status()
    # set app version string for juju status output
    giraph_version = get_package_version('giraph') or 'unknown'
    hookenv.application_version_set(giraph_version)
def setup_pig(self):
    '''
    copy the default configuration files to pig_conf property
    defined in dist.yaml
    '''
    default_conf = self.dist_config.path('pig') / 'conf'
    pig_conf = self.dist_config.path('pig_conf')
    pig_conf.rmtree_p()
    default_conf.copytree(pig_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    default_conf.rmtree_p()
    pig_conf.symlink(default_conf)

    # Configure immutable bits
    pig_bin = self.dist_config.path('pig') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if pig_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], pig_bin])
        env['PIG_CLASSPATH'] = env['HADOOP_CONF_DIR']
        env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
        env['PIG_HOME'] = self.dist_config.path('pig')
def installoracle():
    hookenv.log('Installing Oracle JDK')
    java_major = '8'
    java_minor = '73'
    tarname = 'server-jre-{}u{}-linux-x64.tar.gz'.format(java_major, java_minor)
    dirname = 'jdk1.{}.0_{}'.format(java_major, java_minor)
    destdir = "/opt/java/{}".format(dirname)
    if not os.path.isdir(destdir):
        tfile = tarfile.open('{}/files/{}'.format(charm_dir(), tarname), 'r')
        # Important to note that the following extraction is
        # UNSAFE since .tar.gz archive could contain
        # relative path like ../../ and overwrite other dirs
        filesdir = '{}/files/'.format(charm_dir())
        extractdir = '{}/{}'.format(filesdir, dirname)
        tfile.extractall(filesdir)
        mergecopytree(extractdir, destdir)
        # Set defaults
        subprocess.check_output([
            'update-alternatives', '--install', '/usr/bin/java', 'java',
            '{}/jre/bin/java'.format(destdir), '2000'
        ])
        subprocess.check_output([
            'update-alternatives', '--install', '/usr/bin/javac', 'javac',
            '{}/bin/javac'.format(destdir), '2000'
        ])
        # set env vars
        with utils.environment_edit_in_place('/etc/environment') as env:
            # ensure that correct java is used
            env['JAVA_HOME'] = destdir
            env['J2SDKDIR'] = destdir
            env['J2REDIR'] = '{}/jre'.format(destdir)
            env['DERBY_HOME'] = '{}/db'.format(destdir)
            if destdir not in env['PATH']:
                env['PATH'] = ':'.join([
                    '{}/bin'.format(env['JAVA_HOME']),
                    '{}/bin'.format(env['J2REDIR']),
                    '{}/bin'.format(env['DERBY_HOME']),
                    env['PATH'],
                ])
def initial_config(self):
    """Do one-time Pig configuration.

    Copy the default configuration files to the pig_conf dir from
    dist.yaml and adjust the system environment.
    """
    default_conf = self.dist_config.path('pig') / 'conf'
    pig_conf = self.dist_config.path('pig_conf')
    pig_conf.rmtree_p()
    default_conf.copytree(pig_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    default_conf.rmtree_p()
    pig_conf.symlink(default_conf)

    # Configure immutable bits
    pig_bin = self.dist_config.path('pig') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if pig_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], pig_bin])
        env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
        env['PIG_HOME'] = self.dist_config.path('pig')
        env['JAVA_HOME'] = Path(unitdata.kv().get('java.home'))
def install_go():
    # NB: the 'version' config key is expected to hold a full download URL
    # for the Go tarball (it is fetched directly and split on '/').
    version = hookenv.config().get('version')
    if not version:
        status_set('blocked', 'Provide a Go version')
        return
    try:
        request = requests.get(version)
        if request.status_code != 200:
            return
        file_path = '/tmp/' + version.split('/')[-1]
        with open(file_path, 'wb') as f:
            f.write(request.content)
    except requests.exceptions.RequestException as e:
        hookenv.log(e)
        return
    tar = tarfile.open(file_path, 'r:gz')
    tar.extractall('/tmp')
    tar.close()
    if not os.path.exists('/home/ubuntu/go'):
        shutil.move('/tmp/go', '/home/ubuntu')
        os.makedirs('/home/ubuntu/code/go/bin')
        chown_recursive('/home/ubuntu/go', 'ubuntu', 'ubuntu')
        chown_recursive('/home/ubuntu/code', 'ubuntu', 'ubuntu')
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['GOROOT'] = '/home/ubuntu/go'
        env['GOPATH'] = '/home/ubuntu/code/go'
        env['PATH'] = env['PATH'] + ':/home/ubuntu/go/bin:/home/ubuntu/code/go/bin'

    # Install package manager
    r = requests.get(
        'https://raw.githubusercontent.com/pote/gpm/v1.4.0/bin/gpm')
    with open('/usr/local/bin/gpm', 'wb') as f:
        f.write(r.content)
    os.chmod('/usr/local/bin/gpm', 0o755)
    status_set('active', 'go installed')
    set_state('go.installed')
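# chown_recursive() is assumed to come from the charm's helpers; a minimal
# sketch of the behavior install_go() relies on:
import os
import shutil

def chown_recursive(path, user, group):
    """Recursively change ownership of path to user:group."""
    shutil.chown(path, user, group)
    for root, dirs, files in os.walk(path):
        for name in dirs + files:
            shutil.chown(os.path.join(root, name), user, group)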
def setup_flink(self):
    '''
    Copy the default configuration files to the flink_conf property
    defined in dist.yaml.
    '''
    default_conf = self.dist_config.path('flink') / 'conf'
    flink_conf = self.dist_config.path('flink_conf')
    if os.path.islink(default_conf):
        return
    flink_conf.rmtree_p()
    default_conf.copytree(flink_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    default_conf.rmtree_p()
    flink_conf.symlink(default_conf)

    # Configure immutable bits
    flink_bin = self.dist_config.path('flink') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if flink_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], flink_bin])
        env['FLINK_CLASSPATH'] = env['HADOOP_CONF_DIR']
        env['FLINK_CONF_DIR'] = self.dist_config.path('flink_conf')
        env['FLINK_HOME'] = self.dist_config.path('flink')
def disable_yarn_mode(self):
    # Point SPARK_JAR back at the local spark assembly jar; the HDFS copy
    # is no longer used once YARN mode is disabled.
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*':
            'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled .*':
            'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir .*':
            '# spark.eventLog.dir hdfs:///user/ubuntu/directory',
        r'.*spark.driver.extraClassPath .*':
            '# spark.driver.extraClassPath none',
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', False)
    unitdata.kv().flush(True)
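# utils.re_edit_in_place() appears throughout these snippets. A sketch of
# the assumed contract: each regex is applied line by line, and with
# append_non_matches=True any replacement whose pattern matched nothing is
# appended to the file. Illustration only; the real helper lives in the
# charm utilities.
import re

def re_edit_in_place(path, subs, append_non_matches=False):
    with open(path) as f:
        lines = f.read().splitlines()
    matched = set()
    for i, line in enumerate(lines):
        for pattern, replacement in subs.items():
            if re.search(pattern, line):
                lines[i] = re.sub(pattern, replacement, line)
                matched.add(pattern)
    if append_non_matches:
        lines.extend(repl for pat, repl in subs.items() if pat not in matched)
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')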
def configure_tomcat():
    '''Configure Tomcat: set CATALINA_HOME and add a manager-gui user.'''
    status_set('maintenance', 'Configuring Tomcat...')

    # Set environment variable CATALINA_HOME.
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['CATALINA_HOME'] = TOMCAT_DIR

    # Create a file where the process id of Tomcat can be stored. This makes
    # it possible to check if Tomcat is running. Guard against appending the
    # same line again when this hook re-runs.
    pid_line = 'CATALINA_PID="$CATALINA_BASE/bin/catalina.pid"\n'
    with open(TOMCAT_DIR + "/bin/setenv.sh", "a+") as setenv:
        setenv.seek(0)
        if pid_line not in setenv.read():
            setenv.write(pid_line)

    # Create an admin user that has access to the manager-gui.
    admin_username = config()["admin_username"]
    admin_password = config()["admin_password"]
    context = {
        'admin_username': admin_username,
        'admin_password': admin_password,
    }
    render('tomcat-users.xml', TOMCAT_DIR + '/conf/tomcat-users.xml', context)
    set_state('layer-tomcat.configured')
def configure(self, available_hosts, zk_units, peers):
    """
    This is the core logic of setting up Spark.

    Two flags are needed:

      * Namenode exists, i.e. HDFS is ready
      * Resource manager exists, i.e. YARN is ready

    Both flags are inferred from the available hosts.

    :param dict available_hosts: Hosts that Spark should know about.
    :param list zk_units: List of Zookeeper dicts with host/port info.
    :param list peers: List of Spark peer tuples (unit name, IP).
    """
    # Bootstrap spark
    if not unitdata.kv().get('spark.bootstrapped', False):
        self.setup()
        unitdata.kv().set('spark.bootstrapped', True)

    # Set KV based on connected applications
    unitdata.kv().set('zookeeper.units', zk_units)
    unitdata.kv().set('sparkpeer.units', peers)
    unitdata.kv().flush(True)

    # Get our config ready
    dc = self.dist_config
    events_log_dir = 'file://{}'.format(dc.path('spark_events'))
    mode = hookenv.config()['spark_execution_mode']
    master_ip = utils.resolve_private_address(available_hosts['spark-master'])
    master_url = self.get_master_url(master_ip)

    # Setup hosts dict
    hosts = {
        'spark': master_ip,
    }
    if 'namenode' in available_hosts:
        hosts['namenode'] = available_hosts['namenode']
        events_log_dir = self.setup_hdfs_logs()
    if 'resourcemanager' in available_hosts:
        hosts['resourcemanager'] = available_hosts['resourcemanager']

    # Setup roles dict. We always include the history server and client.
    # Determine other roles based on our execution mode.
    roles = ['spark-history-server', 'spark-client']
    if mode == 'standalone':
        roles.append('spark-master')
        roles.append('spark-worker')
    elif mode.startswith('yarn'):
        roles.append('spark-on-yarn')
        roles.append('spark-yarn-slave')

    # Setup overrides dict
    override = {
        'spark::common::master_url': master_url,
        'spark::common::event_log_dir': events_log_dir,
        'spark::common::history_log_dir': events_log_dir,
    }
    if zk_units:
        zks = []
        for unit in zk_units:
            ip = utils.resolve_private_address(unit['host'])
            zks.append("%s:%s" % (ip, unit['port']))
        zk_connect = ",".join(zks)
        override['spark::common::zookeeper_connection_string'] = zk_connect
    else:
        override['spark::common::zookeeper_connection_string'] = None

    # Create our site.yaml and trigger puppet
    bigtop = Bigtop()
    bigtop.render_site_yaml(hosts, roles, override)
    bigtop.trigger_puppet()

    # Do this after our puppet bits in case puppet overrides needed perms
    if 'namenode' not in available_hosts:
        # Local event dir (not in HDFS) needs to be 777 so non-spark
        # users can write job history there. It needs to be g+s so
        # all entries will be readable by spark (in the spark group).
        # It needs to be +t so users cannot remove files they don't own.
        dc.path('spark_events').chmod(0o3777)
    self.patch_worker_master_url(master_ip, master_url)

    # Handle tuning options that may be set as percentages
    driver_mem = '1g'
    req_driver_mem = hookenv.config()['driver_memory']
    executor_mem = '1g'
    req_executor_mem = hookenv.config()['executor_memory']
    if req_driver_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("driver_memory percentage in non-local mode. "
                        "Using 1g default.", level=None)
    else:
        driver_mem = req_driver_mem
    if req_executor_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("executor_memory percentage in non-local mode. "
                        "Using 1g default.", level=None)
    else:
        executor_mem = req_executor_mem

    spark_env = '/etc/spark/conf/spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DRIVER_MEMORY.*':
            'export SPARK_DRIVER_MEMORY={}'.format(driver_mem),
        r'.*SPARK_EXECUTOR_MEMORY.*':
            'export SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
    }, append_non_matches=True)

    # Install SB (subsequent calls will reconfigure existing install)
    # SparkBench looks for the spark master in /etc/environment
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MASTER'] = master_url
    self.install_benchmark()
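# The percentage-to-memory conversion above appears twice (and again in the
# other configure() variants below). A small pure helper (a sketch, not part
# of the charm) makes the intent explicit; it assumes, as the charm code
# does, that get_total_ram() returns bytes, hence the two /1024 divisions.
def mem_setting(requested, total_ram_bytes, default='1g'):
    """Return a Spark memory string, resolving '50%'-style requests."""
    if not requested:
        return default
    if not requested.endswith('%'):
        return requested
    mem_mb = total_ram_bytes / 1024 / 1024
    fraction = float(requested.strip('%')) / 100
    return '{}m'.format(int(mem_mb * fraction))

# e.g. mem_setting('50%', 8 * 1024**3) == '4096m'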
def configure(self, available_hosts, zk_units, peers, extra_libs):
    """
    This is the core logic of setting up spark.

    :param dict available_hosts: Hosts that Spark should know about.
    :param list zk_units: List of Zookeeper dicts with host/port info.
    :param list peers: List of Spark peer tuples (unit name, IP).
    :param list extra_libs: List of extra lib paths for driver/executors.
    """
    # Set KV based on connected applications
    unitdata.kv().set('zookeeper.units', zk_units)
    unitdata.kv().set('sparkpeer.units', peers)
    unitdata.kv().flush(True)

    # Get our config ready
    dc = self.dist_config
    mode = hookenv.config()['spark_execution_mode']
    master_ip = utils.resolve_private_address(
        available_hosts['spark-master'])
    master_url = self.get_master_url(master_ip)
    req_driver_mem = hookenv.config()['driver_memory']
    req_executor_mem = hookenv.config()['executor_memory']
    if mode.startswith('yarn'):
        spark_events = 'hdfs://{}'.format(dc.path('spark_events'))
    else:
        spark_events = 'file://{}'.format(dc.path('spark_events'))

    # Handle tuning options that may be set as percentages
    driver_mem = '1g'
    executor_mem = '1g'
    if req_driver_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "driver_memory percentage in non-local mode. "
                "Using 1g default.",
                level=hookenv.WARNING)
    else:
        driver_mem = req_driver_mem
    if req_executor_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "executor_memory percentage in non-local mode. "
                "Using 1g default.",
                level=hookenv.WARNING)
    else:
        executor_mem = req_executor_mem

    # Some spark applications look for envars in /etc/environment
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MASTER'] = master_url
        env['SPARK_HOME'] = dc.path('spark_home')

    # Setup hosts dict
    hosts = {
        'spark': master_ip,
    }
    if 'namenode' in available_hosts:
        hosts['namenode'] = available_hosts['namenode']
    if 'resourcemanager' in available_hosts:
        hosts['resourcemanager'] = available_hosts['resourcemanager']

    # Setup roles dict. We always include the history server and client.
    # Determine other roles based on our execution mode.
    roles = ['spark-history-server', 'spark-client']
    if mode == 'standalone':
        roles.append('spark-master')
        roles.append('spark-worker')
    elif mode.startswith('yarn'):
        roles.append('spark-on-yarn')
        roles.append('spark-yarn-slave')

    # Setup overrides dict
    override = {
        'spark::common::master_url': master_url,
        'spark::common::event_log_dir': spark_events,
        'spark::common::history_log_dir': spark_events,
        'spark::common::extra_lib_dirs':
            ':'.join(extra_libs) if extra_libs else None,
        'spark::common::driver_mem': driver_mem,
        'spark::common::executor_mem': executor_mem,
    }
    if zk_units:
        zks = []
        for unit in zk_units:
            ip = utils.resolve_private_address(unit['host'])
            zks.append("%s:%s" % (ip, unit['port']))
        zk_connect = ",".join(zks)
        override['spark::common::zookeeper_connection_string'] = zk_connect
    else:
        override['spark::common::zookeeper_connection_string'] = None

    # Create our site.yaml and trigger puppet.
    # NB: during an upgrade, we configure the site.yaml, but do not
    # trigger puppet. The user must do that with the 'reinstall' action.
    bigtop = Bigtop()
    bigtop.render_site_yaml(hosts, roles, override)
    if unitdata.kv().get('spark.version.repo', False):
        hookenv.log(
            "An upgrade is available and the site.yaml has been "
            "configured. Run the 'reinstall' action to continue.",
            level=hookenv.INFO)
    else:
        bigtop.trigger_puppet()
        self.patch_worker_master_url(master_ip, master_url)

        # Packages don't create the event dir by default. Do it each time
        # spark is (re)installed to ensure location/perms are correct.
        self.configure_events_dir(mode)

    # Handle examples and Spark-Bench. Do this each time this method is
    # called in case we need to act on a new resource or user config.
    self.configure_examples()
    self.configure_sparkbench()
def setup_kafka_config(self):
    '''
    Copy the default configuration files to the kafka_conf property
    defined in dist.yaml.
    '''
    default_conf = self.dist_config.path('kafka') / 'config'
    kafka_conf = self.dist_config.path('kafka_conf')
    kafka_conf.rmtree_p()
    default_conf.copytree(kafka_conf)
    # Now remove the conf included in the tarball and symlink our real conf
    # dir. We've seen issues where kafka still looks for config in
    # KAFKA_HOME/config.
    default_conf.rmtree_p()
    kafka_conf.symlink(default_conf)

    # Similarly, we've seen issues where kafka wants to write to
    # KAFKA_HOME/logs regardless of the LOG_DIR, so make a symlink.
    default_logs = self.dist_config.path('kafka') / 'logs'
    kafka_logs = self.dist_config.path('kafka_app_logs')
    default_logs.rmtree_p()
    kafka_logs.symlink(default_logs)

    # Configure environment
    kafka_bin = self.dist_config.path('kafka') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if kafka_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], kafka_bin])
        env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

    # Configure server.properties
    # NB: We set the advertised.host.name below to our short hostname
    # instead of our private ip so external (non-Juju) clients can connect
    # to kafka (admin will still have to expose kafka and ensure the
    # external client can resolve the short hostname to our public ip).
    short_host = get_ip_for_interface(
        hookenv.config().get('network_interface'))
    if not short_host:
        short_host = hookenv.config().get('hostname')
    if not short_host:
        short_host = check_output(['hostname', '-s']).decode('utf8').strip()
    kafka_port = self.dist_config.port('kafka')
    kafka_server_conf = self.dist_config.path(
        'kafka_conf') / 'server.properties'
    service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
    utils.re_edit_in_place(kafka_server_conf, {
        r'^broker.id=.*': 'broker.id=%s' % unit_num,
        r'^port=.*': 'port=%s' % kafka_port,
        r'^log.dirs=.*':
            'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
        r'^#?advertised.host.name=.*':
            'advertised.host.name=%s' % short_host,
    })

    # Configure producer.properties
    # Note: we set the broker list to whatever we advertise our broker to
    # be (advertised.host.name from above, which is our short hostname).
    kafka_producer_conf = self.dist_config.path(
        'kafka_conf') / 'producer.properties'
    utils.re_edit_in_place(kafka_producer_conf, {
        r'^#?metadata.broker.list=.*':
            'metadata.broker.list=%s:%s' % (short_host, kafka_port),
    })

    # Configure log properties
    kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
    utils.re_edit_in_place(kafka_log4j, {
        r'^kafka.logs.dir=.*':
            'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
    })

    # Configure init script
    template_name = 'upstart.conf'
    template_path = '/etc/init/kafka.conf'
    if host.init_is_systemd():
        template_name = 'systemd.conf'
        template_path = '/etc/systemd/system/kafka.service'
    templating.render(
        template_name,
        template_path,
        context={
            'kafka_conf': self.dist_config.path('kafka_conf'),
            'kafka_bin': '{}/bin'.format(self.dist_config.path('kafka')),
        },
    )
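# get_ip_for_interface() is a charm helper; a rough sketch of the assumed
# behavior using the netifaces package (illustrative only -- the real helper
# also handles CIDR ranges and missing interfaces more carefully):
import netifaces

def get_ip_for_interface(interface):
    """Return the first IPv4 address bound to `interface`, else None."""
    if not interface or interface not in netifaces.interfaces():
        return None
    addrs = netifaces.ifaddresses(interface).get(netifaces.AF_INET, [])
    return addrs[0]['addr'] if addrs else None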
def configure(self):
    '''
    Configure the spark environment for all users.
    '''
    spark_home = self.dist_config.path('spark')
    spark_bin = spark_home / 'bin'

    # Handle tuning options that may be set as percentages
    driver_mem = '1g'
    req_driver_mem = hookenv.config()['driver_memory']
    executor_mem = '1g'
    req_executor_mem = hookenv.config()['executor_memory']
    if req_driver_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "driver_memory percentage in non-local mode. "
                "Using 1g default.", level=None)
    else:
        driver_mem = req_driver_mem
    if req_executor_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "executor_memory percentage in non-local mode. "
                "Using 1g default.", level=None)
    else:
        executor_mem = req_executor_mem

    # Update environment variables
    with utils.environment_edit_in_place('/etc/environment') as env:
        if spark_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], spark_bin])
        env['MASTER'] = self.get_master()
        env['PYSPARK_DRIVER_PYTHON'] = "ipython"
        env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
        env['SPARK_DRIVER_MEMORY'] = driver_mem
        env['SPARK_EXECUTOR_MEMORY'] = executor_mem
        env['SPARK_HOME'] = spark_home
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # Update spark config
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master *.*':
            'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled *.*':
            'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir *.*':
            'spark.eventLog.dir hdfs:///user/ubuntu/directory',
    })

    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    local_ip = utils.resolve_private_address(hookenv.unit_private_ip())
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DRIVER_MEMORY.*':
            'SPARK_DRIVER_MEMORY={}'.format(driver_mem),
        r'.*SPARK_EXECUTOR_MEMORY.*':
            'SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
        r'.*SPARK_LOG_DIR.*':
            'SPARK_LOG_DIR={}'.format(self.dist_config.path('spark_logs')),
        r'.*SPARK_MASTER_IP.*':
            'SPARK_MASTER_IP={}'.format(local_ip),
        r'.*SPARK_WORKER_DIR.*':
            'SPARK_WORKER_DIR={}'.format(self.dist_config.path('spark_work')),
    })

    # Manage SparkBench
    install_sb = hookenv.config()['spark_bench_enabled']
    sb_dir = '/home/ubuntu/spark-bench'
    if install_sb:
        if utils.cpu_arch() == 'ppc64le':
            sb_url = hookenv.config()['spark_bench_ppc64le']
        else:
            # TODO: may need more arch cases (go with x86 sb for now)
            sb_url = hookenv.config()['spark_bench_x86_64']
        Path(sb_dir).rmtree_p()
        fetcher = ArchiveUrlFetchHandler()
        fetcher.install(sb_url, '/home/ubuntu')
        # #####
        # Handle glob if we use a .tgz that doesn't expand to sb_dir:
        # sb_archive_dir = glob('/home/ubuntu/spark-bench-*')[0]
        # SparkBench expects to live in ~/spark-bench, so put it there:
        # Path(sb_archive_dir).rename(sb_dir)
        # #####

        # Comment out mem tunings (let them come from /etc/environment)
        sb_env = Path(sb_dir) / 'conf/env.sh'
        utils.re_edit_in_place(sb_env, {
            r'^SPARK_DRIVER_MEMORY.*':
                '# SPARK_DRIVER_MEMORY (use value from environment)',
            r'^SPARK_EXECUTOR_MEMORY.*':
                '# SPARK_EXECUTOR_MEMORY (use value from environment)',
        })
    else:
        Path(sb_dir).rmtree_p()
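# utils.resolve_private_address() is used by several configure() variants
# above to turn a hostname or address into a usable IP. A bare-bones sketch
# of the assumed contract (the real helper likely consults Juju network
# data as well):
import socket

def resolve_private_address(addr):
    """Return addr if it is already an IPv4 address, else resolve via DNS."""
    try:
        socket.inet_aton(addr)
        return addr  # already dotted-quad
    except OSError:
        return socket.gethostbyname(addr)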