Exemplo n.º 1
    def _get_libjars_local_paths(self, s3_jar_dirs, local_jar_dir):
        """Returns a list of local jar paths downloaded from s3.

            s3_jar_dirs: S3 path from which we pull down JARs from.
            local_jar_dir: local path on every machine in the Qubole cluster
                to which jars are pulled down

            List of local file paths as a string, with each file name delimited
            comma (,), if the supplied s3_jar_dirs is valid. Otherwise, returns
            empty string.
        file_paths = []
        for s3_jar_dir in s3_jar_dirs:
            file_paths += s3_utils.list_s3_directory(s3_jar_dir)

        jar_names = [
            for file_path in file_paths if str(file_path).endswith('jar')
        filtered_jar_names = [
            jar_name for jar_name in jar_names
            if jar_name not in self.config.QUBOLE_JARS_BLACKLIST

        # dedup jar lists.
        filtered_jar_names = list(set(filtered_jar_names))

        final_jar_paths = [
            '%s/%s' % (local_jar_dir, jar_name)
            for jar_name in filtered_jar_names
        return ','.join(final_jar_paths)
Exemplo n.º 2
    def _get_libjars_local_paths(self, s3_jar_dirs, local_jar_dir):
        """Returns a list of local jar paths downloaded from s3.

            s3_jar_dirs: S3 path from which we pull down JARs from.
            local_jar_dir: local path on every machine in the Qubole cluster
                to which jars are pulled down

            List of local file paths as a string, with each file name delimited
            comma (,), if the supplied s3_jar_dirs is valid. Otherwise, returns
            empty string.
        file_paths = []
        for s3_jar_dir in s3_jar_dirs:
            file_paths += s3_utils.list_s3_directory(s3_jar_dir)

        jar_names = [
            for file_path in file_paths if str(file_path).endswith('jar')]
        filtered_jar_names = [
            for jar_name in jar_names if jar_name not in self.config.QUBOLE_JARS_BLACKLIST]

        # dedup jar lists.
        filtered_jar_names = list(set(filtered_jar_names))

        final_jar_paths = [
            '%s/%s' % (local_jar_dir, jar_name)
            for jar_name in filtered_jar_names]
        return ','.join(final_jar_paths)
Exemplo n.º 3
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        """Run a Hadoop job in Qubole cluster.

        Builds one shell command that creates a scratch directory, pulls
        the library jars and the application jar into it with
        "hadoop fs -get", removes the blacklisted Qubole jars, runs
        "hadoop jar", and finally deletes the scratch directory while
        preserving the exit code of the preceding command chain.

        We assume extra_jars are stored on s3.

        We fail the entire command if pulling the JARs down from s3 fails,
        so we use "&&" to connect shell commands.

        NOTE(review): the parameter list after `self` was missing from the
        source and has been reconstructed from the names used in the body;
        confirm order and defaults against callers.

        Args:
            class_name: main class name passed to "hadoop jar".
            jobconf_args: optional dict of JobConf settings, each emitted
                as -D<key>=<value>. The caller's dict is not modified.
            extra_args: optional list of extra argument strings appended
                after the -D options.
            extra_jars: optional list of extra s3 jar locations, appended
                to self.config.USER_LIBJAR_DIRS.

        Returns:
            Tuple of (output, stderr, job_ids) taken from
            self.run_shell_command(); its first return value is dropped.
        """
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []
        # Work on a copy so the defaults set below never leak into the
        # caller's dict.
        jobconf_args = jobconf_args.copy() if jobconf_args else {}

        # The place where all jars are stored in s3.
        s3_jar_dirs = self.config.USER_LIBJAR_DIRS + extra_jars
        # The place where all jars will be copied to locally.
        local_jar_dir = '/tmp/hadoop_users/%s/%s' % \
                        (self.config.USER, utils.get_random_string())
        download_jar_cmds = [
            'hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
            for s3_dir in s3_jar_dirs
        ]
        download_jar_cmd = ' && '.join(download_jar_cmds)
        # NOTE(review): the argument of this call was missing from the
        # source; the app jar path is assumed because that is the file
        # downloaded under this name right below - confirm.
        appjar_name = s3_utils.extract_file_name_from_s3_path(
            self.config.USER_APPJAR_PATH)
        download_jar_cmd += ' && hadoop fs -get %s %s/%s' % (
            self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name)

        # Set default JobConf args.
        if self.config.SCHEDULER_QUEUE:
            # NOTE(review): the assigned value was missing from the source;
            # the queue guarded by the condition is assumed - confirm.
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # Create arguments. items() (rather than iteritems()) works on
        # both Python 2 and Python 3.
        arguments = ' '.join('-D%s=%s' % (k, v)
                             for k, v in jobconf_args.items())
        arguments += ' '
        arguments += ' '.join(extra_args)

        libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)
        hadoop_classpath = '%s/*' % local_jar_dir

        # The %(name)s placeholders are filled in from var_dict at the end.
        cmd = 'mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'

        # Blacklisted jars are removed from the local directory after the
        # download so they are not picked up by the classpath wildcard.
        files_to_be_deleted = []
        for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST:
            files_to_be_deleted.append('%s/%s' % (local_jar_dir, qubole_jar))
        if files_to_be_deleted:
            cmd += ' && rm -f %s' % (' && rm -f '.join(files_to_be_deleted))

        # Generate command.
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'appjar_name': appjar_name,
            'download_jar_cmd': download_jar_cmd,
            'local_jar_dir': local_jar_dir,
            'hadoop_classpath': hadoop_classpath,
            'libjars': libjars,
        }
        cmd += (' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
                ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
                ' %(class_name)s'
                ' -libjars %(libjars)s'
                ' %(arguments)s')
        # Always clean up the scratch directory, but exit with the status
        # of the command chain above.
        cmd += ';\nEXIT_CODE=$?; \nrm -rf %(local_jar_dir)s; \nexit $EXIT_CODE;'
        cmd = cmd % var_dict

        # Log command messages.
        self.log.info('Full command: %s' % cmd)

        # Run command.
        _, output, stderr, job_ids = self.run_shell_command(cmd)
        return output, stderr, job_ids
Exemplo n.º 4
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        """Run a Hadoop job in Qubole cluster.

        Builds one shell command that creates a scratch directory, pulls
        the library jars and the application jar into it with
        "hadoop fs -get", removes the blacklisted Qubole jars, runs
        "hadoop jar", and finally deletes the scratch directory while
        preserving the exit code of the preceding command chain.

        We assume extra_jars are stored on s3.

        We fail the entire command if pulling the JARs down from s3 fails,
        so we use "&&" to connect shell commands.

        NOTE(review): the parameter list after `self` was missing from the
        source and has been reconstructed from the names used in the body;
        confirm order and defaults against callers.

        Args:
            class_name: main class name passed to "hadoop jar".
            jobconf_args: optional dict of JobConf settings, each emitted
                as -D<key>=<value>. The caller's dict is not modified.
            extra_args: optional list of extra argument strings appended
                after the -D options.
            extra_jars: optional list of extra s3 jar locations, appended
                to self.config.USER_LIBJAR_DIRS.

        Returns:
            Tuple of (output, stderr, job_ids) taken from
            self.run_shell_command(); its first return value is dropped.
        """
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []
        # Work on a copy so the defaults set below never leak into the
        # caller's dict.
        jobconf_args = jobconf_args.copy() if jobconf_args else {}

        # The place where all jars are stored in s3.
        s3_jar_dirs = self.config.USER_LIBJAR_DIRS + extra_jars
        # The place where all jars will be copied to locally.
        local_jar_dir = '/tmp/hadoop_users/%s/%s' % \
                        (self.config.USER, utils.get_random_string())
        download_jar_cmds = ['hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
                             for s3_dir in s3_jar_dirs]
        download_jar_cmd = ' && '.join(download_jar_cmds)
        # NOTE(review): the argument of this call and the format arguments
        # of the following statement were missing from the source; the app
        # jar path from the config is assumed - confirm.
        appjar_name = s3_utils.extract_file_name_from_s3_path(
            self.config.USER_APPJAR_PATH)
        download_jar_cmd += ' && hadoop fs -get %s %s/%s' % (
            self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name)

        # Set default JobConf args.
        if self.config.SCHEDULER_QUEUE:
            # NOTE(review): the assigned value was missing from the source;
            # the queue guarded by the condition is assumed - confirm.
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # Create arguments. items() (rather than iteritems()) works on
        # both Python 2 and Python 3.
        arguments = ' '.join('-D%s=%s' % (k, v) for k, v in jobconf_args.items())
        arguments += ' '
        arguments += ' '.join(extra_args)

        libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)
        hadoop_classpath = '%s/*' % local_jar_dir

        # The %(name)s placeholders are filled in from var_dict at the end.
        cmd = 'mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'

        # Blacklisted jars are removed from the local directory after the
        # download so they are not picked up by the classpath wildcard.
        files_to_be_deleted = []
        for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST:
            files_to_be_deleted.append('%s/%s' % (local_jar_dir, qubole_jar))
        if files_to_be_deleted:
            cmd += ' && rm -f %s' % (' && rm -f '.join(files_to_be_deleted))

        # Generate command.
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'appjar_name': appjar_name,
            'download_jar_cmd': download_jar_cmd,
            'local_jar_dir': local_jar_dir,
            'hadoop_classpath': hadoop_classpath,
            'libjars': libjars,
        }
        cmd += (' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
                ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
                ' %(class_name)s'
                ' -libjars %(libjars)s'
                ' %(arguments)s')
        # Always clean up the scratch directory, but exit with the status
        # of the command chain above.
        cmd += ';\nEXIT_CODE=$?; \nrm -rf %(local_jar_dir)s; \nexit $EXIT_CODE;'
        cmd = cmd % var_dict

        # Log command messages.
        self.log.info('Full command: %s' % cmd)

        # Run command.
        _, output, stderr, job_ids = self.run_shell_command(cmd)
        return output, stderr, job_ids