Example #1
    def setup_gobblin(self, host, port):
        '''
        Configure Gobblin. This method must be called whenever something
        changes, e.g. a new Hadoop endpoint becomes available.

        :param str host: Host of the HDFS endpoint.
        :param str port: Port of the HDFS endpoint.
        '''

        # Setup the environment
        gobblin_bin = self.dist_config.path('gobblin') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if gobblin_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], gobblin_bin])
            env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
            env['GOBBLIN_WORK_DIR'] = "/user/gobblin/work"

        hdfs_endpoint = ''.join([host, ':', port])

        # Setup gobblin configuration
        conf_dir = self.dist_config.path('gobblin') / 'conf'
        gobblin_config_template = conf_dir / 'gobblin-mapreduce.properties.template'
        gobblin_config = conf_dir / 'gobblin-mapreduce.properties'
        try:
            copy(gobblin_config_template, gobblin_config)
        except FileNotFoundError:
            pass

        utils.re_edit_in_place(gobblin_config, {
            r'fs.uri=hdfs://localhost:8020': 'fs.uri=hdfs://%s' % hdfs_endpoint,
        })

        if '2.7.2' in self.hadoop_version:
            utils.re_edit_in_place(gobblin_config, {
                r'task.data.root.dir=*': 'task.data.root.dir=${env:GOBBLIN_WORK_DIR}/task'
            }, append_non_matches=True)
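
Note: every example on this page uses utils.environment_edit_in_place as a context manager that exposes /etc/environment as a mutable mapping and writes any changes back on exit. As an illustration of that assumed behaviour only (a sketch, not the actual jujubigdata helper; it ignores quoting corner cases and atomic writes), such a helper could look roughly like this:

import contextlib

@contextlib.contextmanager
def environment_edit_in_place(path='/etc/environment'):
    # Parse simple KEY=value lines (values optionally double-quoted).
    env = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, _, value = line.partition('=')
            env[key.strip()] = value.strip().strip('"')

    yield env  # callers mutate the mapping, e.g. env['PATH'] = ':'.join(...)

    # Write the (possibly modified) mapping back, quoting every value.
    with open(path, 'w') as f:
        for key, value in env.items():
            f.write('{}="{}"\n'.format(key, value))
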
Example #2
    def setup_zookeeper_config(self):
        """Setup Zookeeper configuration based on default config.

        Copy the default configuration files to zookeeper_conf property
        defined in dist.yaml
        """
        default_conf = self.dist_config.path('zookeeper') / 'conf'
        zookeeper_conf = self.dist_config.path('zookeeper_conf')
        zookeeper_conf.rmtree_p()
        default_conf.copytree(zookeeper_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        zookeeper_conf.symlink(default_conf)

        zoo_cfg = zookeeper_conf / 'zoo.cfg'
        if not zoo_cfg.exists():
            (zookeeper_conf / 'zoo_sample.cfg').copy(zoo_cfg)
        utils.re_edit_in_place(zoo_cfg, {
            r'^dataDir.*': 'dataDir={}'.format(self.dist_config.path('zookeeper_data_dir')),
        })

        # Configure zookeeper environment for all users
        zookeeper_bin = self.dist_config.path('zookeeper') / 'bin'
        zookeeper_rest = self.dist_config.path('zookeeper') / 'src/contrib/rest'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if zookeeper_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], zookeeper_bin])
            env['ZOOCFGDIR'] = self.dist_config.path('zookeeper_conf')
            env['ZOO_BIN_DIR'] = zookeeper_bin
            env['ZOO_LOG_DIR'] = self.dist_config.path('zookeeper_log_dir')
            env['ZOO_REST_DIR'] = zookeeper_rest
Example #3
    def setup_gobblin(self, host, port):
        """
        Configure Gobblin. This method must be called whenever something
        changes, e.g. a new Hadoop endpoint becomes available.

        :param str host: Host of the HDFS endpoint.
        :param str port: Port of the HDFS endpoint.
        """

        # Setup the environment
        gobblin_bin = self.dist_config.path("gobblin") / "bin"
        with utils.environment_edit_in_place("/etc/environment") as env:
            if gobblin_bin not in env["PATH"]:
                env["PATH"] = ":".join([env["PATH"], gobblin_bin])
            env["HADOOP_BIN_DIR"] = env["HADOOP_HOME"] + "/bin"
            env["GOBBLIN_WORK_DIR"] = "/user/gobblin/work"

        hdfs_endpoint = "".join([host, ":", port])

        # Setup gobblin configuration
        conf_dir = self.dist_config.path("gobblin") / "conf"
        gobblin_config_template = conf_dir / "gobblin-mapreduce.properties.template"
        gobblin_config = conf_dir / "gobblin-mapreduce.properties"
        copy(gobblin_config_template, gobblin_config)

        utils.re_edit_in_place(gobblin_config, {r"fs.uri=hdfs://localhost:8020": "fs.uri=hdfs://%s" % hdfs_endpoint})
Example #4
    def install(self):
        self.dist_config.add_users()
        self.dist_config.add_dirs()
        jujuresources.install(self.resources['livy'],
                              destination=self.dist_config.path('livy'),
                              skip_top_level=False)

        livy_bin = self.dist_config.path('livy') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if livy_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], livy_bin])
            # Following classpath comes from `hadoop classpath` and should be fixed
            hadoop_cp = '/etc/hadoop/conf:/usr/lib/hadoop/share/hadoop/common/lib/*:/usr/lib/hadoop/share/hadoop/common/*\
:/usr/lib/hadoop/share/hadoop/hdfs:/usr/lib/hadoop/share/hadoop/hdfs/lib/*\
:/usr/lib/hadoop/share/hadoop/hdfs/*:/usr/lib/hadoop/share/hadoop/yarn/lib/*\
:/usr/lib/hadoop/share/hadoop/yarn/*:/usr/lib/hadoop/share/hadoop/mapreduce/lib/*\
:/usr/lib/hadoop/share/hadoop/mapreduce/*:/usr/lib/hadoop/contrib/capacity-scheduler/*.jar'

            env['CLASSPATH'] = hadoop_cp

        cmd = "chown -R {}:hadoop {}".format(self.user,
                                             self.dist_config.path('livy'))
        call(cmd.split())
        cmd = "chown -R {}:hadoop {}".format(
            self.user, self.dist_config.path('livy_conf'))
        call(cmd.split())
Example #5
def installoracle():
    hookenv.log('Installing Oracle JDK')
    filesdir = '{}/files/'.format(charm_dir())
    conf = hookenv.config()
    (tarname, dirname) = get_java_paths(filesdir, conf['install-type'], conf['java-major'])
    destdir = "/opt/java/{}".format(dirname)
    if not os.path.isdir(destdir):
        tfile = tarfile.open(
            '{}/files/{}'.format(charm_dir(), tarname), 'r')
        # Important to note that the following extraction is
        # UNSAFE since .tar.gz archive could contain
        # relative path like ../../ and overwrite other dirs
        extractdir = '{}/{}'.format(filesdir, dirname)
        tfile.extractall(filesdir)
        mergecopytree(extractdir, destdir)
        # Set defaults
        subprocess.check_output(['update-alternatives', '--install', '/usr/bin/java', 'java', '{}/jre/bin/java'.format(destdir), '2000'])
        subprocess.check_output(['update-alternatives', '--install', '/usr/bin/javac', 'javac', '{}/bin/javac'.format(destdir), '2000'])
        # set env vars
        with utils.environment_edit_in_place('/etc/environment') as env:
            # ensure that correct java is used
            env['JAVA_HOME'] = destdir
            env['J2SDKDIR'] = destdir
            env['J2REDIR'] = '{}/jre'.format(destdir)
            env['DERBY_HOME'] = '{}/db'.format(destdir)
            if destdir not in env['PATH']:
                env['PATH'] = ':'.join([
                    '{}/bin'.format(env['JAVA_HOME']),
                    '{}/bin'.format(env['J2REDIR']),
                    '{}/bin'.format(env['DERBY_HOME']),
                    env['PATH'],
                ])
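
The comment in the example above points out that tarfile.extractall on an untrusted archive is unsafe, since member names like ../../etc/passwd can escape the destination. A small guard along these lines (a sketch, not part of the original charm) rejects such members before extracting:

import os
import tarfile

def safe_extractall(tar_path, dest):
    dest = os.path.abspath(dest)
    with tarfile.open(tar_path, 'r') as tar:
        for member in tar.getmembers():
            target = os.path.abspath(os.path.join(dest, member.name))
            # Refuse members that would resolve outside the destination dir.
            if not (target == dest or target.startswith(dest + os.sep)):
                raise ValueError('unsafe member path: {}'.format(member.name))
        tar.extractall(dest)
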
Example #6
    def setup_gobblin(self, host, port):
        '''
        Configure Gobblin. This method must be called whenever something
        changes, e.g. a new Hadoop endpoint becomes available.

        :param str host: Host of the HDFS endpoint.
        :param str port: Port of the HDFS endpoint.
        '''

        # Setup the environment
        gobblin_bin = self.dist_config.path('gobblin') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if gobblin_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], gobblin_bin])
            env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
            env['GOBBLIN_WORK_DIR'] = "/user/gobblin/work"

        hdfs_endpoint = ''.join([host, ':', port])

        # Setup gobblin configuration
        conf_dir = self.dist_config.path('gobblin') / 'conf'
        gobblin_config_template = conf_dir / 'gobblin-mapreduce.properties.template'
        gobblin_config = conf_dir / 'gobblin-mapreduce.properties'
        copy(gobblin_config_template, gobblin_config)

        utils.re_edit_in_place(gobblin_config, {
            r'fs.uri=hdfs://localhost:8020':
            'fs.uri=hdfs://%s' % hdfs_endpoint,
        })
Example #7
    def setup_zookeeper_config(self):
        """
        Set up the Zookeeper configuration based on the default config.

        Copy the default configuration files to the zookeeper_conf property
        defined in dist.yaml.
        """
        default_conf = self.dist_config.path('zookeeper') / 'conf'
        zookeeper_conf = self.dist_config.path('zookeeper_conf')
        zookeeper_conf.rmtree_p()
        default_conf.copytree(zookeeper_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        zookeeper_conf.symlink(default_conf)

        zoo_cfg = zookeeper_conf / 'zoo.cfg'
        if not zoo_cfg.exists():
            (zookeeper_conf / 'zoo_sample.cfg').copy(zoo_cfg)
        utils.re_edit_in_place(zoo_cfg, {
            r'^dataDir.*': 'dataDir={}'.format(self.dist_config.path('zookeeper_data_dir')),
        })

        # Configure zookeeper environment for all users
        zookeeper_bin = self.dist_config.path('zookeeper') / 'bin'
        zookeeper_rest = self.dist_config.path('zookeeper') / 'src/contrib/rest'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if zookeeper_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], zookeeper_bin])
            env['ZOOCFGDIR'] = self.dist_config.path('zookeeper_conf')
            env['ZOO_BIN_DIR'] = zookeeper_bin
            env['ZOO_LOG_DIR'] = self.dist_config.path('zookeeper_log_dir')
            env['ZOO_REST'] = zookeeper_rest
Example #8
    def configure_flume(self, template_data=None):
        '''
        Handle the configuration of Flume and set up the environment.
        '''
        render(
            source='flume.conf.j2',
            target=self.config_file,
            context=dict({
                'dist_config': self.dist_config,
            }, **(template_data or {})),
            filters={
                'agent_list': lambda agents, prefix='': ','.join([
                    '%s%s' % (prefix, a['name']) for a in agents
                ]),
            },
        )

        flume_bin = self.dist_config.path('flume') / 'bin'
        java_symlink = check_output(
            ["readlink", "-f", "/usr/bin/java"]).decode('utf8')
        java_home = re.sub('/bin/java', '', java_symlink).rstrip()
        with utils.environment_edit_in_place('/etc/environment') as env:
            if flume_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], flume_bin])
            env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
            env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
            env['FLUME_HOME'] = self.dist_config.path('flume')
            env['JAVA_HOME'] = java_home
Example #9
    def setup_hive_config(self):
        '''
        Copy the default configuration files to the hive_conf property
        defined in dist.yaml.
        '''
        default_conf = self.dist_config.path('hive') / 'conf'
        hive_conf = self.dist_config.path('hive_conf')
        hive_conf.rmtree_p()
        default_conf.copytree(hive_conf)

        # Configure immutable bits
        hive_bin = self.dist_config.path('hive') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if hive_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hive_bin])
            env['HIVE_CONF_DIR'] = self.dist_config.path('hive_conf')

        hive_env = self.dist_config.path('hive_conf') / 'hive-env.sh'
        if not hive_env.exists():
            (self.dist_config.path('hive_conf') / 'hive-env.sh.template').copy(hive_env)

        hive_site = self.dist_config.path('hive_conf') / 'hive-site.xml'
        if not hive_site.exists():
            (self.dist_config.path('hive_conf') / 'hive-default.xml.template').copy(hive_site)
        with utils.xmlpropmap_edit_in_place(hive_site) as props:
            # TODO (kwm): we should be able to export java.io.tmpdir so these 4 arent needed
            props['hive.exec.local.scratchdir'] = "/tmp/hive"
            props['hive.downloaded.resources.dir'] = "/tmp/hive_resources"
            props['hive.querylog.location'] = "/tmp/hive"
            props['hive.server2.logging.operation.log.location'] = "/tmp/hive"
            ####

        # create hdfs storage space
        utils.run_as('hive', 'hdfs', 'dfs', '-mkdir', '-p', '/user/hive/warehouse')
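
Example #9 also relies on utils.xmlpropmap_edit_in_place to expose a Hadoop-style *-site.xml file (a <configuration> element full of <property><name>/<value> pairs) as a dict and to persist changes on exit. A minimal sketch of that assumed behaviour (not the actual library code) might be:

import contextlib
import xml.etree.ElementTree as ET

@contextlib.contextmanager
def xmlpropmap_edit_in_place(path):
    tree = ET.parse(path)
    root = tree.getroot()  # the <configuration> element
    props = {p.find('name').text: p.find('value').text
             for p in root.findall('property')}

    yield props  # callers read and/or update entries

    # Rebuild the <property> children from the mapping and save the file.
    for p in list(root.findall('property')):
        root.remove(p)
    for name, value in props.items():
        prop = ET.SubElement(root, 'property')
        ET.SubElement(prop, 'name').text = name
        ET.SubElement(prop, 'value').text = str(value)
    tree.write(path)
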
Example #10
    def install(self):
        '''
        Fetch resources
        '''
        self.dist_config.add_users()
        self.dist_config.add_dirs()

        result = resource_get('tomee')
        if not result:
            log("Failed to fetch TomEE resource")
            return False

        unitdata.kv().set("tomeetarball", result)
        log("TomEE tarball path is {}".format(result))
        tomee_install_dir = self.dist_config.path('tomee_dir')
        with chdir(tomee_install_dir):
            utils.run_as('tomcat', 'tar', '-zxvf', '{}'.format(result))

        tomee_dirs = [f for f in os.listdir(tomee_install_dir)
                      if f.startswith('apache-tomee')]
        catalina_home = os.path.join(tomee_install_dir, tomee_dirs[0])
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['CATALINA_HOME'] = catalina_home
        unitdata.kv().set("catalina_home", catalina_home)
        self.open_ports()
        return True
Example #11
    def configure_hadoop(self):
        java_home = Path(unitdata.kv().get('java.home'))
        java_bin = java_home / 'bin'
        hadoop_bin = self.dist_config.path('hadoop') / 'bin'
        hadoop_sbin = self.dist_config.path('hadoop') / 'sbin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['JAVA_HOME'] = java_home
            if java_bin not in env['PATH']:
                env['PATH'] = ':'.join([java_bin, env['PATH']])  # ensure that correct java is used
            if hadoop_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hadoop_bin])
            if hadoop_sbin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hadoop_sbin])
            env['HADOOP_LIBEXEC_DIR'] = self.dist_config.path('hadoop') / 'libexec'
            env['HADOOP_INSTALL'] = self.dist_config.path('hadoop')
            env['HADOOP_HOME'] = self.dist_config.path('hadoop')
            env['HADOOP_COMMON_HOME'] = self.dist_config.path('hadoop')
            env['HADOOP_HDFS_HOME'] = self.dist_config.path('hadoop')
            env['HADOOP_MAPRED_HOME'] = self.dist_config.path('hadoop')
            env['HADOOP_YARN_HOME'] = self.dist_config.path('hadoop')
            env['YARN_HOME'] = self.dist_config.path('hadoop')
            env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
            env['YARN_CONF_DIR'] = self.dist_config.path('hadoop_conf')
            env['YARN_LOG_DIR'] = self.dist_config.path('yarn_log_dir')
            env['HDFS_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')
            env['HADOOP_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')  # for hadoop 2.2.0 only
            env['MAPRED_LOG_DIR'] = '/var/log/hadoop/mapred'  # should be moved to config, but could
            env['MAPRED_PID_DIR'] = '/var/run/hadoop/mapred'  # be destructive for mapreduce operation

        hadoop_env = self.dist_config.path('hadoop_conf') / 'hadoop-env.sh'
        utils.re_edit_in_place(hadoop_env, {
            r'export JAVA_HOME *=.*': 'export JAVA_HOME=%s' % java_home,
        })
Example #12
    def configure(self):
        '''
        Configure spark environment for all users
        '''
        spark_home = self.dist_config.path('spark')
        spark_bin = spark_home / 'bin'

        # put our jar in hdfs
        spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(spark_home))[0]
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/share/lib')
        try:
            utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar, '/user/ubuntu/share/lib/spark-assembly.jar')
        except CalledProcessError:
            print("File exists")

        # update environment variables
        with utils.environment_edit_in_place('/etc/environment') as env:
            if spark_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], spark_bin])
            env['MASTER'] = hookenv.config('spark_execution_mode')
            env['PYSPARK_DRIVER_PYTHON'] = "ipython"
            env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
            env['SPARK_HOME'] = spark_home
            env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

        # update spark config
        spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.eventLog.enabled *.*': 'spark.eventLog.enabled    true',
            r'.*spark.eventLog.dir *.*': 'spark.eventLog.dir    hdfs:///user/ubuntu/directory',
            })
Example #13
    def configure_flume(self, template_data=None):
        '''
        Handle the configuration of Flume and set up the environment.
        '''
        render(
            source='flume.conf.j2',
            target=self.config_file,
            context=dict({
                'dist_config': self.dist_config,
            }, **(template_data or {})),
            filters={
                'agent_list':
                lambda agents, prefix='': ','.join(
                    ['%s%s' % (prefix, a['name']) for a in agents]),
            },
        )

        flume_bin = self.dist_config.path('flume') / 'bin'
        java_symlink = check_output(["readlink", "-f",
                                     "/usr/bin/java"]).decode('utf8')
        java_home = re.sub('/bin/java', '', java_symlink).rstrip()
        with utils.environment_edit_in_place('/etc/environment') as env:
            if flume_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], flume_bin])
            env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
            env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
            env['FLUME_HOME'] = self.dist_config.path('flume')
            env['JAVA_HOME'] = java_home
Example #14
 def update_config(self, mode):
     """
     Configure Pig with the correct classpath.  If Hadoop is available, use
     HADOOP_CONF_DIR, otherwise use PIG_HOME.
     """
     with utils.environment_edit_in_place('/etc/environment') as env:
         key = 'HADOOP_CONF_DIR' if mode == 'mapreduce' else 'PIG_HOME'
         env['PIG_CLASSPATH'] = env[key]
Example #15
    def setup_hue(self, namenodes, resourcemanagers, hdfs_port, yarn_port, yarn_http, yarn_ipc):
        hookenv.status_set('maintenance', 'Setting up Hue')
        hue_bin = self.dist_config.path('hue') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if hue_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hue_bin])
            env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
            env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
            hadoop_conf = env['HADOOP_CONF_DIR'] + '/core-site.xml'
            yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'
            mapred_conf = env['HADOOP_CONF_DIR'] + '/mapred-site.xml'

        with utils.xmlpropmap_edit_in_place(hadoop_conf) as props:
            hdfs_endpoint = props['fs.defaultFS']

        with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
            yarn_log_url = props['yarn.log.server.url'] # 19888
            yarn_resmgr = props['yarn.resourcemanager.address'] # 8032

        with utils.xmlpropmap_edit_in_place(mapred_conf) as props:
            mapred_jobhistory = props['mapreduce.jobhistory.address'] # 10020

        default_conf = self.dist_config.path('hue') / 'desktop/conf'
        hue_conf = self.dist_config.path('hue_conf')

        if os.path.islink('/usr/lib/hue/desktop/conf'):
            return
        else:
            hue_conf.rmtree_p()
            default_conf.copytree(hue_conf)
            # Now remove the conf included in the tarball and symlink our real conf
            default_conf.rmtree_p()
            hue_conf.symlink(default_conf)
        
        hdfs_fulluri = hdfs_endpoint.split('/')[2]
        hdfs_hostname = hdfs_fulluri.split(':')[0]

        hue_config = ''.join((self.dist_config.path('hue'), '/desktop/conf/hue.ini'))
        hue_port = self.dist_config.port('hue_web')

        # Fix following for HA: http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.3.0/bk_hadoop-ha/content/ha-nn-deploy-hue.html
        hookenv.log("Not currently supporting HA, FIX: namenodes are: " + str(namenodes) + " resmanagers: " + str(resourcemanagers))
        utils.re_edit_in_place(hue_config, {
            r'http_port=8888': 'http_port=%s' % hue_port,
            #r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s' % hdfs_endpoint,
            r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s:%s' % (namenodes[0], hdfs_port),
            #r'## resourcemanager_host=localhost': 'resourcemanager_host=%s' % yarn_resmgr.split(':')[0],
            r'.*resourcemanager_host=localhost': 'resourcemanager_host=%s' % resourcemanagers[0],
            #r'## resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_resmgr.split(':')[1],
            r'.*resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_port,
            r'.*webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://%s:50070/webhdfs/v1' % namenodes[0],
            r'.*history_server_api_url=http://localhost:19888': 'history_server_api_url=%s' % yarn_log_url.split('/')[0],
            r'.*resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://%s:8088' % yarn_resmgr.split(':')[0],
            r'.*secret_key=.*': 'secret_key=%s' % uuid.uuid4()
            })

        self.update_apps()
Example #16
    def configure_zeppelin(self):
        '''
        Configure zeppelin environment for all users
        '''
        zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if zeppelin_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
            env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

        zeppelin_site = self.dist_config.path(
            'zeppelin_conf') / 'zeppelin-site.xml'
        with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
            xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
            xml['zeppelin.notebook.dir'] = self.dist_config.path(
                'zeppelin_notebooks')

        etc_env = utils.read_etc_env()
        hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
        spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
        spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
        spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
        spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
        zeppelin_env = self.dist_config.path(
            'zeppelin_conf') / 'zeppelin-env.sh'
        with open(zeppelin_env, "a") as f:
            f.write('export ZEPPELIN_HOME={}\n'.format(
                self.dist_config.path('zeppelin')))
            f.write(
                'export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'
                .format(spark_driver_mem, spark_executor_mem))
            f.write('export ZEPPELIN_LOG_DIR={}\n'.format(
                self.dist_config.path('zeppelin_logs')))
            f.write(
                'export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n'
            )
            f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(
                self.dist_config.path('zeppelin_notebooks')))
            f.write('export SPARK_HOME={}\n'.format(spark_home))
            f.write(
                'export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'
                .format(spark_driver_mem, spark_executor_mem))
            f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
            f.write(
                'export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'
                .format(s=spark_home))
            f.write('export MASTER={}\n'.format(spark_exe_mode))

        # User needs write access to zepp's conf to write interpreter.json
        # on server start. chown the whole conf dir, though we could probably
        # touch that file and chown it, leaving the rest owned as root:root.
        # TODO: weigh implications of have zepp's conf dir owned by non-root.
        cmd = "chown -R ubuntu:hadoop {}".format(
            self.dist_config.path('zeppelin_conf'))
        call(cmd.split())
Example #17
 def setup_etc_env(self):
     '''
     Write some niceties to /etc/environment
     '''
     # Configure system-wide bits
     zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
     zeppelin_conf = self.dist_config.path('zeppelin_conf')
     with utils.environment_edit_in_place('/etc/environment') as env:
         if zeppelin_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
         env['ZEPPELIN_CONF_DIR'] = zeppelin_conf
Example #18
 def setup_etc_env(self):
     '''
     Write some niceties to /etc/environment
     '''
     # Configure system-wide bits
     zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
     zeppelin_conf = self.dist_config.path('zeppelin_conf')
     with utils.environment_edit_in_place('/etc/environment') as env:
         if zeppelin_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
         env['ZEPPELIN_CONF_DIR'] = zeppelin_conf
Example #19
    def configure_yarn_mode(self):
        # put the spark jar in hdfs
        spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/share/lib')
        try:
            utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                         '/user/ubuntu/share/lib/spark-assembly.jar')
        except CalledProcessError:
            pass  # jar already in HDFS from another Spark

        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

        # create hdfs storage space for history server
        dc = self.dist_config
        prefix = dc.path('log_prefix')
        events_dir = dc.path('spark_events')
        events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     events_dir)

        # create hdfs storage space for spark-bench
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/spark-bench')
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     '/user/ubuntu/spark-bench')

        # ensure user-provided Hadoop works
        hadoop_classpath = utils.run_as('hdfs',
                                        'hadoop',
                                        'classpath',
                                        capture_output=True)
        spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
        utils.re_edit_in_place(spark_env, {
            r'.*SPARK_DIST_CLASSPATH.*':
            'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
        },
                               append_non_matches=True)

        # update spark-defaults
        spark_conf = self.dist_config.path(
            'spark_conf') / 'spark-defaults.conf'
        etc_env = utils.read_etc_env()
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*':
            'spark.master {}'.format(self.get_master()),
        },
                               append_non_matches=True)

        unitdata.kv().set('hdfs.available', True)
        unitdata.kv().flush(True)
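
Several of the examples (for instance #1 and #19 above) pass append_non_matches=True to utils.re_edit_in_place so that a replacement line is appended when its pattern matches nothing in the file. As a rough sketch of that contract only (assumed behaviour, not the real helper):

import re

def re_edit_in_place(filename, subs, append_non_matches=False):
    # Apply pattern -> replacement pairs to every line of the file.
    with open(filename) as f:
        lines = f.read().splitlines()

    matched = set()
    edited = []
    for line in lines:
        for pattern, replacement in subs.items():
            if re.search(pattern, line):
                line = re.sub(pattern, replacement, line)
                matched.add(pattern)
        edited.append(line)

    if append_non_matches:
        # e.g. append 'spark.master yarn-client' when no spark.master line existed
        edited.extend(replacement for pattern, replacement in subs.items()
                      if pattern not in matched)

    with open(filename, 'w') as f:
        f.write('\n'.join(edited) + '\n')
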
Example #20
 def initial_pig_config(self):
     '''
     Configure system-wide pig bits.
     '''
     pig_bin = self.dist_config.path('pig') / 'bin'
     with utils.environment_edit_in_place('/etc/environment') as env:
         if pig_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], pig_bin])
         env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
         env['PIG_HOME'] = self.dist_config.path('pig')
         env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
Example #21
 def configure_flume(self):
     flume_bin = self.dist_config.path('flume') / 'bin'
     java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
     java_home = re.sub('/bin/java', '', java_symlink).rstrip()
     java_cp = "{}".format(self.dist_config.path('flume') / 'lib')
     with utils.environment_edit_in_place('/etc/environment') as env:
         if flume_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], flume_bin])
         env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
         env['FLUME_CLASSPATH'] = java_cp
         env['FLUME_HOME'] = self.dist_config.path('flume')
         env['JAVA_HOME'] = java_home
Example #22
 def set_oozie_env(self):
     oozie_bin = self.dist_config.path('oozie') / 'bin'
     with utils.environment_edit_in_place('/etc/environment') as env:
         if oozie_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], oozie_bin])
         env['OOZIE_HOME'] = self.dist_config.path('oozie') / 'libexec'
         env['OOZIE_CONFIG'] = self.dist_config.path('oozie_conf')
         env['OOZIE_DATA'] = self.dist_config.path('oozie_data')
         env['OOZIE_LOG'] = self.dist_config.path('oozie_log')
         env['CATALINA_BASE'] = self.dist_config.path('oozie') / 'oozie-server'
         env['CATALINA_TMPDIR'] = '/tmp'
         env['CATALINA_PID'] = '/tmp/oozie.pid'
Example #23
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'mahout-client',
    ], )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    hookenv.status_set('active', 'ready')
    set_state('mahout.installed')
Example #24
 def configure_flume(self):
     flume_bin = self.dist_config.path('flume') / 'bin'
     java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
     java_home = re.sub('/bin/java', '', java_symlink).rstrip()
     java_cp = "{}".format(self.dist_config.path('flume') / 'lib')
     with utils.environment_edit_in_place('/etc/environment') as env:
         if flume_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], flume_bin])
         env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
         env['FLUME_CLASSPATH'] = java_cp
         env['FLUME_HOME'] = self.dist_config.path('flume')
         env['JAVA_HOME'] = java_home
Example #25
    def setup_kafka_config(self):
        '''
        Copy the default configuration files to the kafka_conf property
        defined in dist.yaml.
        '''
        default_conf = self.dist_config.path('kafka') / 'config'
        kafka_conf = self.dist_config.path('kafka_conf')
        kafka_conf.rmtree_p()
        default_conf.copytree(kafka_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        # dir. we've seen issues where kafka still looks for config in
        # KAFKA_HOME/config.
        default_conf.rmtree_p()
        kafka_conf.symlink(default_conf)

        # Configure immutable bits
        kafka_bin = self.dist_config.path('kafka') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if kafka_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], kafka_bin])
            env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

        # note: we set the advertised.host.name below to the public_address
        # to ensure that external (non-Juju) clients can connect to Kafka
        public_address = hookenv.unit_get('public-address')
        private_ip = utils.resolve_private_address(
            hookenv.unit_get('private-address'))
        kafka_server_conf = self.dist_config.path(
            'kafka_conf') / 'server.properties'
        service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
        utils.re_edit_in_place(
            kafka_server_conf, {
                r'^broker.id=.*':
                'broker.id=%s' % unit_num,
                r'^port=.*':
                'port=%s' % self.dist_config.port('kafka'),
                r'^log.dirs=.*':
                'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
                r'^#?advertised.host.name=.*':
                'advertised.host.name=%s' % public_address,
            })

        kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
        utils.re_edit_in_place(
            kafka_log4j, {
                r'^kafka.logs.dir=.*':
                'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
            })

        # fix for lxc containers and some corner cases in manual provider
        # ensure that public_address is resolvable internally by mapping it to the private IP
        utils.update_kv_host(private_ip, public_address)
        utils.manage_etc_hosts()
Example #26
    def setup_kafka_config(self):
        '''
        Copy the default configuration files to the kafka_conf property
        defined in dist.yaml.
        '''
        default_conf = self.dist_config.path('kafka') / 'config'
        kafka_conf = self.dist_config.path('kafka_conf')
        kafka_conf.rmtree_p()
        default_conf.copytree(kafka_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        # dir. we've seen issues where kafka still looks for config in
        # KAFKA_HOME/config.
        default_conf.rmtree_p()
        kafka_conf.symlink(default_conf)

        # Configure immutable bits
        kafka_bin = self.dist_config.path('kafka') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if kafka_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], kafka_bin])
            env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

        # note: we set the advertised.host.name below to the public_address
        # to ensure that external (non-Juju) clients can connect to Kafka
        public_address = hookenv.unit_get('public-address')
        private_ip = utils.resolve_private_address(hookenv.unit_get('private-address'))
        kafka_server_conf = self.dist_config.path('kafka_conf') / 'server.properties'
        service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
        utils.re_edit_in_place(kafka_server_conf, {
            r'^broker.id=.*': 'broker.id=%s' % unit_num,
            r'^port=.*': 'port=%s' % self.dist_config.port('kafka'),
            r'^log.dirs=.*': 'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
            r'^#?advertised.host.name=.*': 'advertised.host.name=%s' % public_address,
        })

        kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
        utils.re_edit_in_place(kafka_log4j, {
            r'^kafka.logs.dir=.*': 'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
        })

        # fix for lxc containers and some corner cases in manual provider
        # ensure that public_address is resolvable internally by mapping it to the private IP
        utils.update_etc_hosts({private_ip: public_address})

        templating.render(
            'upstart.conf',
            '/etc/init/kafka.conf',
            context={
                'kafka_conf': self.dist_config.path('kafka_conf'),
                'kafka_bin': '{}/bin'.format(self.dist_config.path('kafka'))
            },
        )
Example #27
    def configure_notebook(self):
        # profile config created during install
        ipython_profile = "ipython_notebook_config.py"
        # find path to ipython_notebook_config.py
        pPath = "/home/ubuntu/.ipython/profile_pyspark"
        cmd = ['find', pPath, '-name', ipython_profile]
        profile_config = check_output(cmd, universal_newlines=True).strip()

        # update profile with standard opts and configured port
        port = self.dist_config.port('notebook')
        notebooks_dir = self.dist_config.path('notebooks')
        utils.re_edit_in_place(profile_config, {
            r'.*c.NotebookApp.ip *=.*':
            'c.NotebookApp.ip = "*"',

            r'.*c.NotebookApp.open_browser *=.*':
            'c.NotebookApp.open_browser = False',

            r'.*c.NotebookApp.port *=.*':
            'c.NotebookApp.port = {}'.format(port),

            r'.*c.NotebookManager.notebook_dir *=.*':
            "c.NotebookManager.notebook_dir = u'{}'".format(notebooks_dir),
        })

        spark_home = os.environ.get("SPARK_HOME", '/usr/lib/spark')
        py4j = "py4j-0.*.zip"
        cmd = "find {} -name {}".format(spark_home, py4j)
        # TODO: handle missing py4j
        py4j_path = check_output(cmd.split(), universal_newlines=True).strip()

        setup_source = 'scripts/00-pyspark-setup.py'
        Path(setup_source).chmod(0o755)
        Path(setup_source).chown('ubuntu', 'hadoop')
        utils.re_edit_in_place(setup_source, {
            r'py4j *=.*': 'py4j="{}"'.format(py4j_path),
        })
        home = Path(os.environ.get('HOME', '/home/ubuntu'))
        profile_dir = home / '.ipython/profile_pyspark'
        setup_target = profile_dir / 'startup/00-pyspark-setup.py'
        Path(setup_source).copy2(setup_target)

        # Our spark charm defaults to yarn-client, so that should
        # be a safe default here in case MASTER isn't set. Update the env
        # with our spark mode and py4j location.
        spark_mode = os.environ.get("MASTER", "yarn-client")
        spark_home = Path(os.environ.get("SPARK_HOME", "/usr/lib/spark"))
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['PYSPARK_DRIVER_PYTHON_OPTS'] = "notebook"
            env['PYSPARK_SUBMIT_ARGS'] = "--master " + spark_mode
            env['PYTHONPATH'] = spark_home / py4j_path
Example #28
    def disable_yarn_mode(self):
        # put the spark jar in hdfs
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
                                    self.dist_config.path('spark')))[0]

        # update spark-defaults
        spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        }, append_non_matches=True)

        unitdata.kv().set('hdfs.available', False)
        unitdata.kv().flush(True)
Example #29
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'mahout-client',
        ],
    )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    hookenv.status_set('active', 'ready')
    set_state('mahout.installed')
Example #30
    def configure_yarn_mode(self):
        # put the spark jar in hdfs
        spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
                                  self.dist_config.path('spark')))[0]
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/share/lib')
        try:
            utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                         '/user/ubuntu/share/lib/spark-assembly.jar')
        except CalledProcessError:
            pass  # jar already in HDFS from another Spark

        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

        # create hdfs storage space for history server
        dc = self.dist_config
        prefix = dc.path('log_prefix')
        events_dir = dc.path('spark_events')
        events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     events_dir)

        # create hdfs storage space for spark-bench
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/spark-bench')
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     '/user/ubuntu/spark-bench')

        # ensure user-provided Hadoop works
        hadoop_classpath = utils.run_as('hdfs', 'hadoop', 'classpath',
                                        capture_output=True)
        spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
        utils.re_edit_in_place(spark_env, {
            r'.*SPARK_DIST_CLASSPATH.*': 'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
        }, append_non_matches=True)

        # update spark-defaults
        spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
        etc_env = utils.read_etc_env()
        hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
            r'.*spark.driver.extraClassPath .*': 'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
        }, append_non_matches=True)

        unitdata.kv().set('hdfs.available', True)
        unitdata.kv().flush(True)
Example #31
def superset_startup():

    superset_dir = '/home/ubuntu/superset'
    db_uri = 'sqlite:////home/ubuntu/.superset/superset.db'
    context = {'db_uri': db_uri}
    host.mkdir(superset_dir)
    templating.render(source='superset_config.py.jinja2',
                      target=superset_dir + '/superset_config.py',
                      context=context)

    with utils.environment_edit_in_place('/etc/environment') as env:
        # Appending superset_config.py to the PYTHONPATH
        env['PYTHONPATH'] = "$PYTHONPATH:%s" % (superset_dir +
                                                '/superset_config.py')

    # Create an admin user (you will be prompted to set username, first and last name before setting a password)
    # Username [admin]:
    # User first name [admin]:
    # User last name [user]:
    # Email [[email protected]]:
    # Password:
    # Repeat for confirmation:
    hookenv.log('Creating admin user for Superset')
    child = pexpect.spawn(
        "su - ubuntu -c \"fabmanager create-admin --app superset\"")
    child.expect('\\r\\nUsername \[admin\]: ')
    child.sendline()
    child.expect('\\r\\nUser first name \[admin\]: ')
    child.sendline()
    child.expect('\\r\\nUser last name \[user\]: ')
    child.sendline()
    child.expect('\\r\\nEmail \[[email protected]\]: ')
    child.sendline()
    child.expect('\\r\\nPassword: ')
    child.sendline('admin')
    child.expect('\\r\\nRepeat for confirmation: ')
    child.sendline('admin')

    # Create default roles and permissions
    hookenv.log('Create default roles and permissions')
    #subprocess.check_call(['superset', 'init'])
    subprocess.check_call(['su', '-', 'ubuntu', '-c', 'superset init'])
    # Start the web server on port 8088, use -p to bind to another port
    hookenv.log('Start the web server on port 8088')
    #subprocess.Popen(['superset', 'runserver'])
    subprocess.Popen(['su', '-', 'ubuntu', '-c', 'superset runserver'])

    set_state('superset.ready')
    status_set('active', 'Superset up and running')
Example #32
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'mahout-client',
    ], )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    set_state('mahout.installed')
    hookenv.status_set('active', 'ready')
    # set app version string for juju status output
    mahout_version = get_package_version('mahout') or 'unknown'
    hookenv.application_version_set(mahout_version)
Example #33
    def configure_zeppelin(self):
        """
        Configure zeppelin environment for all users
        """
        zeppelin_bin = self.dist_config.path("zeppelin") / "bin"
        with utils.environment_edit_in_place("/etc/environment") as env:
            if zeppelin_bin not in env["PATH"]:
                env["PATH"] = ":".join([env["PATH"], zeppelin_bin])
            env["ZEPPELIN_CONF_DIR"] = self.dist_config.path("zeppelin_conf")

        zeppelin_site = self.dist_config.path("zeppelin_conf") / "zeppelin-site.xml"
        with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
            xml["zeppelin.server.port"] = self.dist_config.port("zeppelin")
            xml["zeppelin.notebook.dir"] = self.dist_config.path("zeppelin_notebooks")

        etc_env = utils.read_etc_env()
        hadoop_conf_dir = etc_env.get("HADOOP_CONF_DIR", "/etc/hadoop/conf")
        spark_home = etc_env.get("SPARK_HOME", "/usr/lib/spark")
        spark_driver_mem = etc_env.get("SPARK_DRIVER_MEMORY", "1g")
        spark_exe_mode = os.environ.get("MASTER", "yarn-client")
        spark_executor_mem = etc_env.get("SPARK_EXECUTOR_MEMORY", "1g")
        zeppelin_env = self.dist_config.path("zeppelin_conf") / "zeppelin-env.sh"
        with open(zeppelin_env, "a") as f:
            f.write("export ZEPPELIN_HOME={}\n".format(self.dist_config.path("zeppelin")))
            f.write(
                'export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
                    spark_driver_mem, spark_executor_mem
                )
            )
            f.write("export ZEPPELIN_LOG_DIR={}\n".format(self.dist_config.path("zeppelin_logs")))
            f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
            f.write("export ZEPPELIN_NOTEBOOK_DIR={}\n".format(self.dist_config.path("zeppelin_notebooks")))
            f.write("export SPARK_HOME={}\n".format(spark_home))
            f.write(
                'export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
                    spark_driver_mem, spark_executor_mem
                )
            )
            f.write("export HADOOP_CONF_DIR={}\n".format(hadoop_conf_dir))
            f.write("export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n".format(s=spark_home))
            f.write("export MASTER={}\n".format(spark_exe_mode))

        # User needs write access to zepp's conf to write interpreter.json
        # on server start. chown the whole conf dir, though we could probably
        # touch that file and chown it, leaving the rest owned as root:root.
        # TODO: weigh implications of have zepp's conf dir owned by non-root.
        cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path("zeppelin_conf"))
        call(cmd.split())
Example #34
    def configure_yarn_mode(self):
        # put the spark jar in hdfs
        spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
            self.dist_config.path('spark')))[0]
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/share/lib')
        try:
            utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                         '/user/ubuntu/share/lib/spark-assembly.jar')
        except CalledProcessError:
            pass  # jar already in HDFS from another Spark

        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

        # create hdfs storage space for history server
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     self.dist_config.path('spark_events'))
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     self.dist_config.path('spark_events'))

        # create hdfs storage space for spark-bench
        utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/ubuntu/spark-bench')
        utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                     '/user/ubuntu/spark-bench')

        # update spark-defaults
        spark_conf = self.dist_config.path(
            'spark_conf') / 'spark-defaults.conf'
        etc_env = utils.read_etc_env()
        hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*':
            'spark.master {}'.format(self.get_master()),
            r'.*spark.eventLog.enabled .*':
            'spark.eventLog.enabled true',
            r'.*spark.eventLog.dir .*':
            'spark.eventLog.dir hdfs://{}'.format(
                self.dist_config.path('spark_events')),
            r'.*spark.driver.extraClassPath .*':
            'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
        },
                               append_non_matches=True)

        unitdata.kv().set('hdfs.available', True)
        unitdata.kv().flush(True)
Example #35
    def disable_yarn_mode(self):
        # put the spark jar in hdfs
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
                self.dist_config.path('spark')))[0]

        # update spark-defaults
        spark_conf = self.dist_config.path(
            'spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*':
            'spark.master {}'.format(self.get_master()),
        },
                               append_non_matches=True)

        unitdata.kv().set('hdfs.available', False)
        unitdata.kv().flush(True)
Example #36
    def disable_yarn_mode(self):
        # put the spark jar in hdfs
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
                                  self.dist_config.path('spark')))[0]

        # update spark-defaults
        spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
            r'.*spark.eventLog.enabled .*': 'spark.eventLog.enabled true',
            r'.*spark.eventLog.dir .*': '# spark.eventLog.dir hdfs:///user/ubuntu/directory',
            r'.*spark.driver.extraClassPath .*': '# spark.driver.extraClassPath none',
        }, append_non_matches=True)

        unitdata.kv().set('hdfs.available', False)
        unitdata.kv().flush(True)
Example #37
    def configure_zeppelin(self):
        '''
        Configure zeppelin environment for all users
        '''
        zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if zeppelin_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
            env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

        zeppelin_site = self.dist_config.path('zeppelin_conf') / 'zeppelin-site.xml'
        with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
            xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
            xml['zeppelin.notebook.dir'] = self.dist_config.path('zeppelin_notebooks')

        etc_env = utils.read_etc_env()
        hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
        hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
        spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
        spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
        spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
        spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
        zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
        with open(zeppelin_env, "a") as f:
            f.write('export ZEPPELIN_CLASSPATH_OVERRIDES={}\n'.format(hadoop_extra_classpath))
            f.write('export ZEPPELIN_HOME={}\n'.format(self.dist_config.path('zeppelin')))
            f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
                spark_driver_mem,
                spark_executor_mem))
            f.write('export ZEPPELIN_LOG_DIR={}\n'.format(self.dist_config.path('zeppelin_logs')))
            f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
            f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(self.dist_config.path('zeppelin_notebooks')))
            f.write('export SPARK_HOME={}\n'.format(spark_home))
            f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
                spark_driver_mem,
                spark_executor_mem))
            f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
            f.write('export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'.format(s=spark_home))
            f.write('export MASTER={}\n'.format(spark_exe_mode))

        # User needs write access to zepp's conf to write interpreter.json
        # on server start. chown the whole conf dir, though we could probably
        # touch that file and chown it, leaving the rest owned as root:root.
        # TODO: weigh implications of have zepp's conf dir owned by non-root.
        cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path('zeppelin_conf'))
        call(cmd.split())
Example #38
 def configure_flume_env(self, flume_hdfs_info_dict):
     config = hookenv.config()
     templating.render(
         source='flume.conf.j2',
         target=self.dist_config.path('flume_conf') / 'flume.conf',
         context={'dist_config': self.dist_config, 'config': config, 'flume_hdfs': flume_hdfs_info_dict})
     
     flume_bin = self.dist_config.path('flume') / 'bin'
     java_symlink = check_output(["readlink", "-f", "/usr/bin/java"]).decode('utf8')
     java_home = re.sub('/bin/java', '', java_symlink).rstrip()
     with utils.environment_edit_in_place('/etc/environment') as env:
         if flume_bin not in env['PATH']:
             env['PATH'] = ':'.join([env['PATH'], flume_bin])
         env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
         env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
         env['FLUME_HOME'] = self.dist_config.path('flume')
         env['JAVA_HOME'] = java_home
Example #39
def install_mahout():
    hookenv.status_set('maintenance', 'installing mahout')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'mahout-client',
        ],
    )
    bigtop.trigger_puppet()
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MAHOUT_HOME'] = '/usr/lib/mahout'

    set_state('mahout.installed')
    hookenv.status_set('active', 'ready')
    # set app version string for juju status output
    mahout_version = get_package_version('mahout') or 'unknown'
    hookenv.application_version_set(mahout_version)
Example #40
def install_giraph(giraph):
    """Install giraph when prerequisite states are present."""
    hookenv.status_set('maintenance', 'installing giraph')
    bigtop = Bigtop()
    bigtop.render_site_yaml(
        roles=[
            'giraph-client',
        ],
    )
    bigtop.trigger_puppet()

    # Put down the -doc subpackage so we get giraph-examples
    fetch.apt_install('giraph-doc')

    giraph_home = Path('/usr/lib/giraph')
    giraph_docdir = Path('/usr/share/doc/giraph')
    giraph_libdir = Path(giraph_home / 'lib')
    giraph_examples = glob('{}/giraph-examples-*.jar'.format(giraph_docdir))

    # Gather a list of all the giraph jars (needed for -libjars)
    giraph_jars = giraph_examples
    giraph_jars.extend(get_good_jars(giraph_home, prefix=True))
    giraph_jars.extend(get_good_jars(giraph_libdir, prefix=True))

    # Update environment with appropriate giraph bits. HADOOP_CLASSPATH can
    # use wildcards (and it should for readability), but GIRAPH_JARS, which
    # is intended to be used as 'hadoop jar -libjars $GIRAPH_JARS', needs to
    # be a comma-separated list of jars.
    with utils.environment_edit_in_place('/etc/environment') as env:
        cur_cp = env['HADOOP_CLASSPATH'] if 'HADOOP_CLASSPATH' in env else ""
        env['GIRAPH_HOME'] = giraph_home
        env['HADOOP_CLASSPATH'] = "{examples}/*:{home}/*:{libs}/*:{cp}".format(
            examples=giraph_docdir,
            home=giraph_home,
            libs=giraph_libdir,
            cp=cur_cp
        )
        env['GIRAPH_JARS'] = ','.join(j for j in giraph_jars)

    set_state('giraph.installed')
    report_status()
    # set app version string for juju status output
    giraph_version = get_package_version('giraph') or 'unknown'
    hookenv.application_version_set(giraph_version)
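
A hedged aside on the GIRAPH_JARS comment above: 'hadoop jar ... -libjars' takes an explicit comma-separated list and will not expand classpath-style wildcards, which is why the two variables are built differently. A minimal sketch (not from the charm; the helper name and argument handling are illustrative) of how a job submission could consume them:

import os
import subprocess

def submit_giraph_job(runner_args):
    # GIRAPH_JARS is comma-separated and wildcard-free; HADOOP_CLASSPATH (which
    # may contain wildcards) is picked up by the hadoop wrapper script itself.
    giraph_jars = os.environ['GIRAPH_JARS']
    examples_jar = giraph_jars.split(',')[0]  # giraph-examples-*.jar was added first
    cmd = ['hadoop', 'jar', examples_jar,
           'org.apache.giraph.GiraphRunner',
           '-libjars', giraph_jars] + list(runner_args)
    subprocess.check_call(cmd)

Here runner_args would carry the computation class, input/output formats and paths, and worker count for the particular job.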
Пример #41
0
    def configure_flume(self):
        config = hookenv.config()
        templating.render(
            source='flume.conf.j2',
            target=self.dist_config.path('flume_conf') / 'flume.conf',
            context={'dist_config': self.dist_config, 'config': config})

        flume_bin = self.dist_config.path('flume') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if flume_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], flume_bin])
            env['FLUME_CONF_DIR'] = self.dist_config.path('flume_conf')
            env['FLUME_CLASSPATH'] = self.dist_config.path('flume') / 'lib'
            env['FLUME_HOME'] = self.dist_config.path('flume')

        # flume_env = self.dist_config.path('flume_conf') / 'flume-env.sh'
        # utils.re_edit_in_place(flume_env, {
        # })
        utils.run_as('flume', 'hdfs', 'dfs', '-mkdir', '-p', '/user/flume')
Пример #42
0
    def setup_hue(self):
        hue_bin = self.dist_config.path('hue') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if hue_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hue_bin])
            env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
            env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
            hadoop_conf = env['HADOOP_CONF_DIR'] + '/core-site.xml'
            yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'
            mapred_conf = env['HADOOP_CONF_DIR'] + '/mapred-site.xml'

        with utils.xmlpropmap_edit_in_place(hadoop_conf) as props:
            hdfs_endpoint = props['fs.defaultFS']

        with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
            yarn_log_url = props['yarn.log.server.url'] # 19888
            yarn_resmgr = props['yarn.resourcemanager.address'] # 8032

        with utils.xmlpropmap_edit_in_place(mapred_conf) as props:
            mapred_jobhistory = props['mapreduce.jobhistory.address'] # 10020

        default_conf = self.dist_config.path('hue') / 'desktop/conf'
        hue_conf = self.dist_config.path('hue_conf')
        hue_conf.rmtree_p()
        default_conf.copytree(hue_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        hue_conf.symlink(default_conf)

        hdfs_fulluri = hdfs_endpoint.split('/')[2]
        hdfs_hostname = hdfs_fulluri.split(':')[0]

        hue_config = ''.join((self.dist_config.path('hue'), '/desktop/conf/hue.ini'))
        hue_port = self.dist_config.port('hue_web')
        utils.re_edit_in_place(hue_config, {
            r'http_port=8888': 'http_port=%s' % hue_port,
            r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs=%s' % hdfs_endpoint,
            r'## resourcemanager_host=localhost': 'resourcemanager_host=%s' % yarn_resmgr.split(':')[0],
            r'## resourcemanager_port=8032': 'resourcemanager_port=%s' % yarn_resmgr.split(':')[1],
            r'## webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://%s:50070/webhdfs/v1' % hdfs_hostname,
            r'## history_server_api_url=http://localhost:19888': 'history_server_api_url=%s' % yarn_log_url.split('/')[0],
            r'## resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://%s:8088' % yarn_resmgr.split(':')[0]
            })
Пример #43
0
    def setup_hue(self, namenodes, resourcemanagers, hdfs_port, yarn_port):
        hookenv.status_set('maintenance', 'Setting up Hue')
        hue_bin = self.dist_config.path('hue') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if hue_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hue_bin])
            env['HADOOP_BIN_DIR'] = env['HADOOP_HOME'] + '/bin'
            env['GOBBLIN_WORK_DIR'] = self.dist_config.path('outputdir')
            yarn_conf = env['HADOOP_CONF_DIR'] + '/yarn-site.xml'

        with utils.xmlpropmap_edit_in_place(yarn_conf) as props:
            yarn_log_url = props['yarn.log.server.url'] # 19888
            yarn_resmgr = props['yarn.resourcemanager.address'] # 8032

        default_conf = self.dist_config.path('hue') / 'desktop/conf'
        hue_conf = self.dist_config.path('hue_conf')

        if os.path.islink('/usr/lib/hue/desktop/conf'):
            return
        else:
            hue_conf.rmtree_p()
            default_conf.copytree(hue_conf)
            # Now remove the conf included in the tarball and symlink our real conf
            default_conf.rmtree_p()
            hue_conf.symlink(default_conf)

        hue_port = self.dist_config.port('hue_web')

        # Fix following for HA: http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.3.0/bk_hadoop-ha/content/ha-nn-deploy-hue.html
        hookenv.log("Not currently supporting HA, FIX: namenodes are: " + str(namenodes) + " resmanagers: " + str(resourcemanagers))
        utils.re_edit_in_place(self.hue_config, {
            r'http_port=8888': 'http_port={}'.format(hue_port),
            r'fs_defaultfs=hdfs://localhost:8020': 'fs_defaultfs={}:{}'.format(namenodes[0], hdfs_port),
            r'.*resourcemanager_host=localhost': 'resourcemanager_host={}'.format(resourcemanagers[0]),
            r'.*resourcemanager_port=8032': 'resourcemanager_port={}'.format(yarn_port),
            r'.*webhdfs_url=http://localhost:50070/webhdfs/v1': 'webhdfs_url=http://{}:50070/webhdfs/v1'.format(namenodes[0]),
            r'.*history_server_api_url=http://localhost:19888': 'history_server_api_url={}'.format(yarn_log_url.split('/')[0]),
            r'.*resourcemanager_api_url=http://localhost:8088': 'resourcemanager_api_url=http://{}:8088'.format(yarn_resmgr.split(':')[0]),
            r'.*secret_key=.*': 'secret_key={}'.format(uuid.uuid4())
            })

        self.update_apps()
Пример #44
0
    def configure_livy(self):
        """
        Configure livy environment for all users
        """
        livy_bin = self.dist_config.path("livy") / "bin"
        with utils.environment_edit_in_place("/etc/environment") as env:
            if livy_bin not in env["PATH"]:
                env["PATH"] = ":".join([env["PATH"], livy_bin])
            hadoop_cp = "/etc/hadoop/conf:/usr/lib/hadoop/share/hadoop/common/lib/*:/usr/lib/hadoop/share/hadoop/common/*\
:/usr/lib/hadoop/share/hadoop/hdfs:/usr/lib/hadoop/share/hadoop/hdfs/lib/*\
:/usr/lib/hadoop/share/hadoop/hdfs/*:/usr/lib/hadoop/share/hadoop/yarn/lib/*\
:/usr/lib/hadoop/share/hadoop/yarn/*:/usr/lib/hadoop/share/hadoop/mapreduce/lib/*\
:/usr/lib/hadoop/share/hadoop/mapreduce/*:/usr/lib/hadoop/contrib/capacity-scheduler/*.jar"
            env["CLASSPATH"] = hadoop_cp

        cmd = "chown -R hue:hadoop {}".format(self.dist_config.path("livy"))
        call(cmd.split())
        cmd = "chown -R hue:hadoop {}".format(self.dist_config.path("livy_conf"))
        call(cmd.split())
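
The CLASSPATH above is hard-coded to the Bigtop /usr/lib/hadoop layout. Where that layout is not guaranteed, the same value can usually be obtained from the installation itself; a minimal sketch, assuming the hadoop wrapper script is on PATH:

from subprocess import check_output

def hadoop_classpath():
    # Ask Hadoop for the classpath it would use, instead of hard-coding jars.
    return check_output(['hadoop', 'classpath']).decode('utf8').strip()

The returned string could then be assigned to env['CLASSPATH'] in place of the literal above.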
Пример #45
0
    def configure_hadoop(self):
        java_home = Path(unitdata.kv().get('java.home'))
        java_bin = java_home / 'bin'
        hadoop_home = self.dist_config.path('hadoop')
        hadoop_bin = hadoop_home / 'bin'
        hadoop_sbin = hadoop_home / 'sbin'

        # If we have hadoop-addons (like lzo), set those in the environment
        hadoop_extra_classpath = []
        if 'lzo' in self.resources:
            hadoop_extra_classpath.extend(
                hadoop_home.walkfiles('hadoop-lzo-*.jar'))
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['JAVA_HOME'] = java_home
            if java_bin not in env['PATH']:
                # ensure that correct java is used
                env['PATH'] = ':'.join([java_bin, env['PATH']])
            if hadoop_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hadoop_bin])
            if hadoop_sbin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hadoop_sbin])
            if hadoop_extra_classpath:
                env['HADOOP_EXTRA_CLASSPATH'] = ':'.join(
                    hadoop_extra_classpath)
            env['HADOOP_LIBEXEC_DIR'] = hadoop_home / 'libexec'
            env['HADOOP_INSTALL'] = hadoop_home
            env['HADOOP_HOME'] = hadoop_home
            env['HADOOP_COMMON_HOME'] = hadoop_home
            env['HADOOP_HDFS_HOME'] = hadoop_home
            env['HADOOP_MAPRED_HOME'] = hadoop_home
            env['HADOOP_MAPRED_LOG_DIR'] = self.dist_config.path(
                'mapred_log_dir')
            env['HADOOP_YARN_HOME'] = hadoop_home
            env['HADOOP_CONF_DIR'] = self.dist_config.path('hadoop_conf')
            env['YARN_LOG_DIR'] = self.dist_config.path('yarn_log_dir')
            env['HADOOP_LOG_DIR'] = self.dist_config.path('hdfs_log_dir')

        hadoop_env = self.dist_config.path('hadoop_conf') / 'hadoop-env.sh'
        utils.re_edit_in_place(
            hadoop_env, {
                r'export JAVA_HOME *=.*': 'export JAVA_HOME=%s' % java_home,
            })
Пример #46
0
def install_giraph(giraph):
    """Install giraph when prerequisite states are present."""
    hookenv.status_set('maintenance', 'installing giraph')
    bigtop = Bigtop()
    bigtop.render_site_yaml(roles=[
        'giraph-client',
    ], )
    bigtop.trigger_puppet()

    # Put down the -doc subpackage so we get giraph-examples
    fetch.apt_install('giraph-doc')

    giraph_home = Path('/usr/lib/giraph')
    giraph_docdir = Path('/usr/share/doc/giraph')
    giraph_libdir = Path(giraph_home / 'lib')
    giraph_examples = glob('{}/giraph-examples-*.jar'.format(giraph_docdir))

    # Gather a list of all the giraph jars (needed for -libjars)
    giraph_jars = giraph_examples
    giraph_jars.extend(get_good_jars(giraph_home, prefix=True))
    giraph_jars.extend(get_good_jars(giraph_libdir, prefix=True))

    # Update environment with appropriate giraph bits. HADOOP_CLASSPATH can
    # use wildcards (and it should for readability), but GIRAPH_JARS, which
    # is intended to be used as 'hadoop jar -libjars $GIRAPH_JARS', needs to
    # be a comma-separated list of jars.
    with utils.environment_edit_in_place('/etc/environment') as env:
        cur_cp = env['HADOOP_CLASSPATH'] if 'HADOOP_CLASSPATH' in env else ""
        env['GIRAPH_HOME'] = giraph_home
        env['HADOOP_CLASSPATH'] = "{examples}/*:{home}/*:{libs}/*:{cp}".format(
            examples=giraph_docdir,
            home=giraph_home,
            libs=giraph_libdir,
            cp=cur_cp)
        env['GIRAPH_JARS'] = ','.join(j for j in giraph_jars)

    set_state('giraph.installed')
    report_status()
    # set app version string for juju status output
    giraph_version = get_package_version('giraph') or 'unknown'
    hookenv.application_version_set(giraph_version)
Пример #47
0
    def setup_pig(self):
        '''
        copy the default configuration files to pig_conf property
        defined in dist.yaml
        '''
        default_conf = self.dist_config.path('pig') / 'conf'
        pig_conf = self.dist_config.path('pig_conf')
        pig_conf.rmtree_p()
        default_conf.copytree(pig_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        pig_conf.symlink(default_conf)

        # Configure immutable bits
        pig_bin = self.dist_config.path('pig') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if pig_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], pig_bin])
            env['PIG_CLASSPATH'] = env['HADOOP_CONF_DIR']
            env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
            env['PIG_HOME'] = self.dist_config.path('pig')
Пример #48
0
def installoracle():
    hookenv.log('Installing Oracle JDK')
    java_major = '8'
    java_minor = '73'
    tarname = 'server-jre-{}u{}-linux-x64.tar.gz'.format(
        java_major, java_minor)
    dirname = 'jdk1.{}.0_{}'.format(java_major, java_minor)
    destdir = "/opt/java/{}".format(dirname)
    if not os.path.isdir(destdir):
        tfile = tarfile.open('{}/files/{}'.format(charm_dir(), tarname), 'r')
        # Note: the following extraction is UNSAFE, since a .tar.gz archive
        # could contain relative paths like ../../ and overwrite other dirs.
        filesdir = '{}/files/'.format(charm_dir())
        extractdir = '{}/{}'.format(filesdir, dirname)
        tfile.extractall(filesdir)
        mergecopytree(extractdir, destdir)
        # Set defaults
        subprocess.check_output([
            'update-alternatives', '--install', '/usr/bin/java', 'java',
            '{}/jre/bin/java'.format(destdir), '2000'
        ])
        subprocess.check_output([
            'update-alternatives', '--install', '/usr/bin/javac', 'javac',
            '{}/bin/javac'.format(destdir), '2000'
        ])
        # set env vars
        with utils.environment_edit_in_place('/etc/environment') as env:
            # ensure that correct java is used
            env['JAVA_HOME'] = destdir
            env['J2SDKDIR'] = destdir
            env['J2REDIR'] = '{}/jre'.format(destdir)
            env['DERBY_HOME'] = '{}/db'.format(destdir)
            if destdir not in env['PATH']:
                env['PATH'] = ':'.join([
                    '{}/bin'.format(env['JAVA_HOME']),
                    '{}/bin'.format(env['J2REDIR']),
                    '{}/bin'.format(env['DERBY_HOME']),
                    env['PATH'],
                ])
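
The UNSAFE note above refers to tar path traversal: a crafted archive can contain members such as ../../etc/passwd. A minimal guard sketch (not part of the charm) that validates every member path before extracting:

import os
import tarfile

def safe_extractall(tar_path, dest_dir):
    # Refuse any member whose resolved path would land outside dest_dir.
    dest_dir = os.path.realpath(dest_dir)
    with tarfile.open(tar_path, 'r') as tar:
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest_dir, member.name))
            if target != dest_dir and not target.startswith(dest_dir + os.sep):
                raise ValueError('blocked path traversal: {}'.format(member.name))
        tar.extractall(dest_dir)

Newer Python releases also provide tarfile extraction filters that serve the same purpose.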
Пример #49
0
    def initial_config(self):
        """Do one-time Pig configuration.

        Copy the default configuration files to the pig_conf dir from dist.yaml
        and adjust system environment.
        """
        default_conf = self.dist_config.path('pig') / 'conf'
        pig_conf = self.dist_config.path('pig_conf')
        pig_conf.rmtree_p()
        default_conf.copytree(pig_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        pig_conf.symlink(default_conf)

        # Configure immutable bits
        pig_bin = self.dist_config.path('pig') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if pig_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], pig_bin])
            env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
            env['PIG_HOME'] = self.dist_config.path('pig')
            env['JAVA_HOME'] = Path(unitdata.kv().get('java.home'))
Пример #50
0
    def initial_config(self):
        """Do one-time Pig configuration.

        Copy the default configuration files to the pig_conf dir from dist.yaml
        and adjust system environment.
        """
        default_conf = self.dist_config.path('pig') / 'conf'
        pig_conf = self.dist_config.path('pig_conf')
        pig_conf.rmtree_p()
        default_conf.copytree(pig_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        pig_conf.symlink(default_conf)

        # Configure immutable bits
        pig_bin = self.dist_config.path('pig') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if pig_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], pig_bin])
            env['PIG_CONF_DIR'] = self.dist_config.path('pig_conf')
            env['PIG_HOME'] = self.dist_config.path('pig')
            env['JAVA_HOME'] = Path(unitdata.kv().get('java.home'))
Пример #51
0
def install_go():
    version = hookenv.config().get('version')
    if not version:
        status_set('blocked', 'Provide a Go version')
        return
    try:
        request = requests.get(version)
        if not request.status_code == 200:
            return
        file_path = '/tmp/' + version.split('/')[-1]
        with open(file_path, 'wb') as f:
            f.write(request.content)
    except requests.exceptions.RequestException as e:
        hookenv.log(e)
        return
    tar = tarfile.open(file_path, 'r:gz')
    tar.extractall('/tmp')
    tar.close()
    if not os.path.exists('/home/ubuntu/go'):
        shutil.move('/tmp/go', '/home/ubuntu')
    os.makedirs('/home/ubuntu/code/go/bin')
    chown_recursive('/home/ubuntu/go', 'ubuntu', 'ubuntu')
    chown_recursive('/home/ubuntu/code', 'ubuntu', 'ubuntu')
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['GOROOT'] = '/home/ubuntu/go'
        env['GOPATH'] = '/home/ubuntu/code/go'
        env['PATH'] = env['PATH'] + ':/home/ubuntu/go/bin:/home/ubuntu/code/go/bin'

    # Install package manager
    r = requests.get(
        'https://raw.githubusercontent.com/pote/gpm/v1.4.0/bin/gpm')
    with open('/usr/local/bin/gpm', 'wb') as f:
        f.write(r.content)
    os.chmod("/usr/local/bin/gpm", 0o755)

    status_set('active', 'go installed')
    set_state('go.installed')
Пример #52
0
    def setup_flink(self):
        '''
        copy the default configuration files to flink_conf property
        defined in dist.yaml
        '''
        default_conf = self.dist_config.path('flink') / 'conf'
        flink_conf = self.dist_config.path('flink_conf')
        if os.path.islink(default_conf):
            return
        flink_conf.rmtree_p()
        default_conf.copytree(flink_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        default_conf.rmtree_p()
        flink_conf.symlink(default_conf)

        # Configure immutable bits
        flink_bin = self.dist_config.path('flink') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if flink_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], flink_bin])
            env['FLINK_CLASSPATH'] = env['HADOOP_CONF_DIR']
            env['FLINK_CONF_DIR'] = self.dist_config.path('flink_conf')
            env['FLINK_HOME'] = self.dist_config.path('flink')
Пример #53
0
    def disable_yarn_mode(self):
        # put the spark jar in hdfs
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['SPARK_JAR'] = glob('{}/lib/spark-assembly-*.jar'.format(
                self.dist_config.path('spark')))[0]

        # update spark-defaults
        spark_conf = self.dist_config.path(
            'spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(spark_conf, {
            r'.*spark.master .*':
            'spark.master {}'.format(self.get_master()),
            r'.*spark.eventLog.enabled .*':
            'spark.eventLog.enabled true',
            r'.*spark.eventLog.dir .*':
            '# spark.eventLog.dir hdfs:///user/ubuntu/directory',
            r'.*spark.driver.extraClassPath .*':
            '# spark.driver.extraClassPath none',
        },
                               append_non_matches=True)

        unitdata.kv().set('hdfs.available', False)
        unitdata.kv().flush(True)
Пример #54
0
    def setup_hive_config(self):
        '''
        copy the default configuration files to hive_conf property
        defined in dist.yaml
        '''
        default_conf = self.dist_config.path('hive') / 'conf'
        hive_conf = self.dist_config.path('hive_conf')
        hive_conf.rmtree_p()
        default_conf.copytree(hive_conf)

        # Configure immutable bits
        hive_bin = self.dist_config.path('hive') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if hive_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], hive_bin])
            env['HIVE_CONF_DIR'] = self.dist_config.path('hive_conf')

        hive_env = self.dist_config.path('hive_conf') / 'hive-env.sh'
        if not hive_env.exists():
            (self.dist_config.path('hive_conf') /
             'hive-env.sh.template').copy(hive_env)

        hive_site = self.dist_config.path('hive_conf') / 'hive-site.xml'
        if not hive_site.exists():
            (self.dist_config.path('hive_conf') /
             'hive-default.xml.template').copy(hive_site)
        with utils.xmlpropmap_edit_in_place(hive_site) as props:
            # TODO (kwm): we should be able to export java.io.tmpdir so these 4 aren't needed
            props['hive.exec.local.scratchdir'] = "/tmp/hive"
            props['hive.downloaded.resources.dir'] = "/tmp/hive_resources"
            props['hive.querylog.location'] = "/tmp/hive"
            props['hive.server2.logging.operation.log.location'] = "/tmp/hive"
            ####

        # create hdfs storage space
        utils.run_as('hive', 'hdfs', 'dfs', '-mkdir', '-p',
                     '/user/hive/warehouse')
Пример #55
0
def configure_tomcat():
    '''Configures Tomcat by setting environment variable and adding a user.'''
    status_set('maintenance', 'Configuring Tomcat...')

    # Set environment variable CATALINA_HOME.
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['CATALINA_HOME'] = TOMCAT_DIR

    # Create a file where the process id of Tomcat can be stored. This makes
    # it possible to check if Tomcat is running.
    with open(TOMCAT_DIR + "/bin/setenv.sh", "a+") as setenv:
        setenv.write('CATALINA_PID="$CATALINA_BASE/bin/catalina.pid"')

    # Creates an admin user that has access to the manager-gui.
    admin_username = config()["admin_username"]
    admin_password = config()["admin_password"]

    context = {
        'admin_username': admin_username,
        'admin_password': admin_password
    }
    render('tomcat-users.xml', TOMCAT_DIR + '/conf/tomcat-users.xml', context)

    set_state('layer-tomcat.configured')
Пример #56
0
    def configure(self, available_hosts, zk_units, peers):
        """
        This is the core logic of setting up spark.

        Two flags are needed:

          * Namenode exists aka HDFS is ready
          * Resource manager exists aka YARN is ready

        Both flags are inferred from the available hosts.

        :param dict available_hosts: Hosts that Spark should know about.
        """
        # Bootstrap spark
        if not unitdata.kv().get('spark.bootstrapped', False):
            self.setup()
            unitdata.kv().set('spark.bootstrapped', True)

        # Set KV based on connected applications
        unitdata.kv().set('zookeeper.units', zk_units)
        unitdata.kv().set('sparkpeer.units', peers)
        unitdata.kv().flush(True)

        # Get our config ready
        dc = self.dist_config
        events_log_dir = 'file://{}'.format(dc.path('spark_events'))
        mode = hookenv.config()['spark_execution_mode']
        master_ip = utils.resolve_private_address(available_hosts['spark-master'])
        master_url = self.get_master_url(master_ip)

        # Setup hosts dict
        hosts = {
            'spark': master_ip,
        }
        if 'namenode' in available_hosts:
            hosts['namenode'] = available_hosts['namenode']
            events_log_dir = self.setup_hdfs_logs()

        if 'resourcemanager' in available_hosts:
            hosts['resourcemanager'] = available_hosts['resourcemanager']

        # Setup roles dict. We always include the history server and client.
        # Determine other roles based on our execution mode.
        roles = ['spark-history-server', 'spark-client']
        if mode == 'standalone':
            roles.append('spark-master')
            roles.append('spark-worker')
        elif mode.startswith('yarn'):
            roles.append('spark-on-yarn')
            roles.append('spark-yarn-slave')

        # Setup overrides dict
        override = {
            'spark::common::master_url': master_url,
            'spark::common::event_log_dir': events_log_dir,
            'spark::common::history_log_dir': events_log_dir,
        }
        if zk_units:
            zks = []
            for unit in zk_units:
                ip = utils.resolve_private_address(unit['host'])
                zks.append("%s:%s" % (ip, unit['port']))

            zk_connect = ",".join(zks)
            override['spark::common::zookeeper_connection_string'] = zk_connect
        else:
            override['spark::common::zookeeper_connection_string'] = None

        # Create our site.yaml and trigger puppet
        bigtop = Bigtop()
        bigtop.render_site_yaml(hosts, roles, override)
        bigtop.trigger_puppet()

        # Do this after our puppet bits in case puppet overrides needed perms
        if 'namenode' not in available_hosts:
            # Local event dir (not in HDFS) needs to be 777 so non-spark
            # users can write job history there. It needs to be g+s so
            # all entries will be readable by spark (in the spark group).
            # It needs to be +t so users cannot remove files they don't own.
            dc.path('spark_events').chmod(0o3777)

        self.patch_worker_master_url(master_ip, master_url)

        # handle tuning options that may be set as percentages
        driver_mem = '1g'
        req_driver_mem = hookenv.config()['driver_memory']
        executor_mem = '1g'
        req_executor_mem = hookenv.config()['executor_memory']
        if req_driver_mem.endswith('%'):
            if mode == 'standalone' or mode.startswith('local'):
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_driver_mem.strip('%')) / 100
                driver_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
                            level=None)
        else:
            driver_mem = req_driver_mem

        if req_executor_mem.endswith('%'):
            if mode == 'standalone' or mode.startswith('local'):
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_executor_mem.strip('%')) / 100
                executor_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
                            level=None)
        else:
            executor_mem = req_executor_mem

        spark_env = '/etc/spark/conf/spark-env.sh'
        utils.re_edit_in_place(spark_env, {
            r'.*SPARK_DRIVER_MEMORY.*': 'export SPARK_DRIVER_MEMORY={}'.format(driver_mem),
            r'.*SPARK_EXECUTOR_MEMORY.*': 'export SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
        }, append_non_matches=True)

        # Install SB (subsequent calls will reconfigure existing install)
        # SparkBench looks for the spark master in /etc/environment
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['MASTER'] = master_url
        self.install_benchmark()
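
The driver/executor memory handling above (it recurs in the following examples as well) only converts a percentage request into an absolute value when Spark runs locally or standalone; otherwise it falls back to 1g. A condensed sketch of that logic as a helper, assuming host.get_total_ram() returns bytes as the /1024/1024 division implies:

def resolve_mem_setting(requested, total_ram_bytes, local_mode, default='1g'):
    # '50%' -> e.g. '7942m' based on total RAM in local/standalone mode;
    # absolute values such as '2g' or '512m' pass through unchanged.
    if not requested.endswith('%'):
        return requested
    if not local_mode:
        return default  # mirror the charm: warn and use the default rather than guess
    mem_mb = total_ram_bytes / 1024 / 1024
    return str(int(mem_mb * float(requested.strip('%')) / 100)) + 'm'

With it, driver_mem would be resolve_mem_setting(req_driver_mem, host.get_total_ram(), mode == 'standalone' or mode.startswith('local')).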
Пример #57
0
    def configure(self, available_hosts, zk_units, peers, extra_libs):
        """
        This is the core logic of setting up spark.

        :param dict available_hosts: Hosts that Spark should know about.
        :param list zk_units: List of Zookeeper dicts with host/port info.
        :param list peers: List of Spark peer tuples (unit name, IP).
        :param list extra_libs: List of extra lib paths for driver/executors.
        """
        # Set KV based on connected applications
        unitdata.kv().set('zookeeper.units', zk_units)
        unitdata.kv().set('sparkpeer.units', peers)
        unitdata.kv().flush(True)

        # Get our config ready
        dc = self.dist_config
        mode = hookenv.config()['spark_execution_mode']
        master_ip = utils.resolve_private_address(
            available_hosts['spark-master'])
        master_url = self.get_master_url(master_ip)
        req_driver_mem = hookenv.config()['driver_memory']
        req_executor_mem = hookenv.config()['executor_memory']
        if mode.startswith('yarn'):
            spark_events = 'hdfs://{}'.format(dc.path('spark_events'))
        else:
            spark_events = 'file://{}'.format(dc.path('spark_events'))

        # handle tuning options that may be set as percentages
        driver_mem = '1g'
        executor_mem = '1g'
        if req_driver_mem.endswith('%'):
            if mode == 'standalone' or mode.startswith('local'):
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_driver_mem.strip('%')) / 100
                driver_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log(
                    "driver_memory percentage in non-local mode. "
                    "Using 1g default.",
                    level=hookenv.WARNING)
        else:
            driver_mem = req_driver_mem

        if req_executor_mem.endswith('%'):
            if mode == 'standalone' or mode.startswith('local'):
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_executor_mem.strip('%')) / 100
                executor_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log(
                    "executor_memory percentage in non-local mode. "
                    "Using 1g default.",
                    level=hookenv.WARNING)
        else:
            executor_mem = req_executor_mem

        # Some spark applications look for envars in /etc/environment
        with utils.environment_edit_in_place('/etc/environment') as env:
            env['MASTER'] = master_url
            env['SPARK_HOME'] = dc.path('spark_home')

        # Setup hosts dict
        hosts = {
            'spark': master_ip,
        }
        if 'namenode' in available_hosts:
            hosts['namenode'] = available_hosts['namenode']
        if 'resourcemanager' in available_hosts:
            hosts['resourcemanager'] = available_hosts['resourcemanager']

        # Setup roles dict. We always include the history server and client.
        # Determine other roles based on our execution mode.
        roles = ['spark-history-server', 'spark-client']
        if mode == 'standalone':
            roles.append('spark-master')
            roles.append('spark-worker')
        elif mode.startswith('yarn'):
            roles.append('spark-on-yarn')
            roles.append('spark-yarn-slave')

        # Setup overrides dict
        override = {
            'spark::common::master_url':
            master_url,
            'spark::common::event_log_dir':
            spark_events,
            'spark::common::history_log_dir':
            spark_events,
            'spark::common::extra_lib_dirs':
            ':'.join(extra_libs) if extra_libs else None,
            'spark::common::driver_mem':
            driver_mem,
            'spark::common::executor_mem':
            executor_mem,
        }
        if zk_units:
            zks = []
            for unit in zk_units:
                ip = utils.resolve_private_address(unit['host'])
                zks.append("%s:%s" % (ip, unit['port']))

            zk_connect = ",".join(zks)
            override['spark::common::zookeeper_connection_string'] = zk_connect
        else:
            override['spark::common::zookeeper_connection_string'] = None

        # Create our site.yaml and trigger puppet.
        # NB: during an upgrade, we configure the site.yaml, but do not
        # trigger puppet. The user must do that with the 'reinstall' action.
        bigtop = Bigtop()
        bigtop.render_site_yaml(hosts, roles, override)
        if unitdata.kv().get('spark.version.repo', False):
            hookenv.log(
                "An upgrade is available and the site.yaml has been "
                "configured. Run the 'reinstall' action to continue.",
                level=hookenv.INFO)
        else:
            bigtop.trigger_puppet()
            self.patch_worker_master_url(master_ip, master_url)

            # Packages don't create the event dir by default. Do it each time
            # spark is (re)installed to ensure location/perms are correct.
            self.configure_events_dir(mode)

        # Handle examples and Spark-Bench. Do this each time this method is
        # called in case we need to act on a new resource or user config.
        self.configure_examples()
        self.configure_sparkbench()
Пример #58
0
    def setup_kafka_config(self):
        '''
        copy the default configuration files to kafka_conf property
        defined in dist.yaml
        '''
        default_conf = self.dist_config.path('kafka') / 'config'
        kafka_conf = self.dist_config.path('kafka_conf')
        kafka_conf.rmtree_p()
        default_conf.copytree(kafka_conf)
        # Now remove the conf included in the tarball and symlink our real conf
        # dir. we've seen issues where kafka still looks for config in
        # KAFKA_HOME/config.
        default_conf.rmtree_p()
        kafka_conf.symlink(default_conf)

        # Similarly, we've seen issues where kafka wants to write to
        # KAFKA_HOME/logs regardless of the LOG_DIR, so make a symlink.
        default_logs = self.dist_config.path('kafka') / 'logs'
        kafka_logs = self.dist_config.path('kafka_app_logs')
        default_logs.rmtree_p()
        kafka_logs.symlink(default_logs)

        # Configure environment
        kafka_bin = self.dist_config.path('kafka') / 'bin'
        with utils.environment_edit_in_place('/etc/environment') as env:
            if kafka_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], kafka_bin])
            env['LOG_DIR'] = self.dist_config.path('kafka_app_logs')

        # Configure server.properties
        # NB: We set the advertised.host.name below to our short hostname
        # instead of our private ip so external (non-Juju) clients can connect
        # to kafka (admin will still have to expose kafka and ensure the
        # external client can resolve the short hostname to our public ip).
        short_host = get_ip_for_interface(hookenv.config('network_interface'))
        if not short_host:
            short_host = hookenv.config().get('hostname')
        if not short_host:
            short_host = check_output(['hostname',
                                       '-s']).decode('utf8').strip()
        kafka_port = self.dist_config.port('kafka')
        kafka_server_conf = self.dist_config.path(
            'kafka_conf') / 'server.properties'
        service, unit_num = os.environ['JUJU_UNIT_NAME'].split('/', 1)
        utils.re_edit_in_place(
            kafka_server_conf, {
                r'^broker.id=.*':
                'broker.id=%s' % unit_num,
                r'^port=.*':
                'port=%s' % kafka_port,
                r'^log.dirs=.*':
                'log.dirs=%s' % self.dist_config.path('kafka_data_logs'),
                r'^#?advertised.host.name=.*':
                'advertised.host.name=%s' % short_host,
            })

        # Configure producer.properties
        # note: we set the broker list to whatever we advertise our broker to
        # be (advertised.host.name from above, which is our short hostname).
        kafka_producer_conf = self.dist_config.path(
            'kafka_conf') / 'producer.properties'
        utils.re_edit_in_place(
            kafka_producer_conf, {
                r'^#?metadata.broker.list=.*':
                'metadata.broker.list=%s:%s' % (short_host, kafka_port),
            })

        # Configure log properties
        kafka_log4j = self.dist_config.path('kafka_conf') / 'log4j.properties'
        utils.re_edit_in_place(
            kafka_log4j, {
                r'^kafka.logs.dir=.*':
                'kafka.logs.dir=%s' % self.dist_config.path('kafka_app_logs'),
            })

        # Configure init script
        template_name = 'upstart.conf'
        template_path = '/etc/init/kafka.conf'
        if host.init_is_systemd():
            template_name = 'systemd.conf'
            template_path = '/etc/systemd/system/kafka.service'

        templating.render(
            template_name,
            template_path,
            context={
                'kafka_conf': self.dist_config.path('kafka_conf'),
                'kafka_bin': '{}/bin'.format(self.dist_config.path('kafka'))
            },
        )
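
Because advertised.host.name is set to the short hostname, an external client has to resolve that name itself (for example via /etc/hosts) before it can reach the broker. A small illustrative sketch (not from the charm) that reads the advertised address back out of server.properties to build the broker string such a client would use:

def advertised_broker(server_properties_path):
    # Return 'shorthost:port' as written by setup_kafka_config() above.
    host = port = None
    with open(server_properties_path) as f:
        for line in f:
            line = line.strip()
            if line.startswith('advertised.host.name='):
                host = line.split('=', 1)[1]
            elif line.startswith('port='):
                port = line.split('=', 1)[1]
    return '{}:{}'.format(host, port)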
Пример #59
0
    def configure(self):
        '''
        Configure spark environment for all users
        '''
        spark_home = self.dist_config.path('spark')
        spark_bin = spark_home / 'bin'

        # handle tuning options that may be set as percentages
        driver_mem = '1g'
        req_driver_mem = hookenv.config()['driver_memory']
        executor_mem = '1g'
        req_executor_mem = hookenv.config()['executor_memory']
        if req_driver_mem.endswith('%'):
            if self.is_spark_local():
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_driver_mem.strip('%')) / 100
                driver_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log(
                    "driver_memory percentage in non-local mode. Using 1g default.",
                    level=None)
        else:
            driver_mem = req_driver_mem

        if req_executor_mem.endswith('%'):
            if self.is_spark_local():
                mem_mb = host.get_total_ram() / 1024 / 1024
                req_percentage = float(req_executor_mem.strip('%')) / 100
                executor_mem = str(int(mem_mb * req_percentage)) + 'm'
            else:
                hookenv.log(
                    "executor_memory percentage in non-local mode. Using 1g default.",
                    level=None)
        else:
            executor_mem = req_executor_mem

        # update environment variables
        with utils.environment_edit_in_place('/etc/environment') as env:
            if spark_bin not in env['PATH']:
                env['PATH'] = ':'.join([env['PATH'], spark_bin])
            env['MASTER'] = self.get_master()
            env['PYSPARK_DRIVER_PYTHON'] = "ipython"
            env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
            env['SPARK_DRIVER_MEMORY'] = driver_mem
            env['SPARK_EXECUTOR_MEMORY'] = executor_mem
            env['SPARK_HOME'] = spark_home
            env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

        # update spark config
        spark_conf = self.dist_config.path(
            'spark_conf') / 'spark-defaults.conf'
        utils.re_edit_in_place(
            spark_conf, {
                r'.*spark.master *.*':
                'spark.master {}'.format(self.get_master()),
                r'.*spark.eventLog.enabled *.*':
                'spark.eventLog.enabled true',
                r'.*spark.eventLog.dir *.*':
                'spark.eventLog.dir hdfs:///user/ubuntu/directory',
            })
        spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
        local_ip = utils.resolve_private_address(hookenv.unit_private_ip())
        utils.re_edit_in_place(
            spark_env, {
                r'.*SPARK_DRIVER_MEMORY.*':
                'SPARK_DRIVER_MEMORY={}'.format(driver_mem),
                r'.*SPARK_EXECUTOR_MEMORY.*':
                'SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
                r'.*SPARK_LOG_DIR.*':
                'SPARK_LOG_DIR={}'.format(self.dist_config.path('spark_logs')),
                r'.*SPARK_MASTER_IP.*':
                'SPARK_MASTER_IP={}'.format(local_ip),
                r'.*SPARK_WORKER_DIR.*':
                'SPARK_WORKER_DIR={}'.format(
                    self.dist_config.path('spark_work')),
            })

        # manage SparkBench
        install_sb = hookenv.config()['spark_bench_enabled']
        sb_dir = '/home/ubuntu/spark-bench'
        if install_sb:
            if utils.cpu_arch() == 'ppc64le':
                sb_url = hookenv.config()['spark_bench_ppc64le']
            else:
                # TODO: may need more arch cases (go with x86 sb for now)
                sb_url = hookenv.config()['spark_bench_x86_64']

            Path(sb_dir).rmtree_p()
            fetcher = ArchiveUrlFetchHandler()
            fetcher.install(sb_url, '/home/ubuntu')

            # #####
            # Handle glob if we use a .tgz that doesn't expand to sb_dir
            # sb_archive_dir = glob('/home/ubuntu/spark-bench-*')[0]
            # SparkBench expects to live in ~/spark-bench, so put it there
            # Path(sb_archive_dir).rename(sb_dir)
            # #####

            # comment out mem tunings (let them come from /etc/environment)
            sb_env = Path(sb_dir) / 'conf/env.sh'
            utils.re_edit_in_place(
                sb_env, {
                    r'^SPARK_DRIVER_MEMORY.*':
                    '# SPARK_DRIVER_MEMORY (use value from environment)',
                    r'^SPARK_EXECUTOR_MEMORY.*':
                    '# SPARK_EXECUTOR_MEMORY (use value from environment)',
                })
        else:
            Path(sb_dir).rmtree_p()