Пример #1
0
    def _simulate_jobconf_for_step(self,
                                   task_type,
                                   step_num,
                                   task_num,
                                   map_split=None):
        j = {}

        # TODO: these are really poor imtations of Hadoop IDs. See #1254
        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, task_type.lower(), step_num, task_num)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, task_type.lower(), step_num, task_num)

        j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

        # TODO: is this the correct format?
        j['mapreduce.task.partition'] = str(task_num)

        j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

        working_dir = self._task_working_dir(task_type, step_num, task_num)
        j['mapreduce.job.local.dir'] = working_dir

        for x in ('archive', 'file'):
            named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

            # mapreduce.job.cache.archives
            # mapreduce.job.cache.files
            j['mapreduce.job.cache.%ss' % x] = ','.join(
                '%s#%s' % (path, name) for name, path in named_paths)

            # mapreduce.job.cache.local.archives
            # mapreduce.job.cache.local.files
            j['mapreduce.job.cache.local.%ss' % x] = ','.join(
                join(working_dir, name) for name, path in named_paths)

        if map_split:
            j['mapreduce.map.input.file'] = 'file://' + map_split['file']
            j['mapreduce.map.input.length'] = str(map_split['length'])
            j['mapreduce.map.input.start'] = str(map_split['start'])

        # translate to correct version

        # don't use translate_jobconf_dict(); that's meant to add keys
        # to user-supplied jobconf
        hadoop_version = self.get_hadoop_version()

        if hadoop_version:
            return {
                translate_jobconf(k, hadoop_version): v
                for k, v in j.items()
            }
        else:
            return {
                tk: v
                for k, v in j.items()
                for tk in translate_jobconf_for_all_versions(k)
            }
Пример #2
0
    def _simulate_jobconf_for_step(
            self, task_type, step_num, task_num, map_split=None):
        j = {}

        # TODO: these are really poor imtations of Hadoop IDs. See #1254
        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, task_type.lower(), step_num, task_num)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, task_type.lower(), step_num, task_num)

        j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

        # TODO: is this the correct format?
        j['mapreduce.task.partition'] = str(task_num)

        j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

        working_dir = self._task_working_dir(task_type, step_num, task_num)
        j['mapreduce.job.local.dir'] = working_dir

        for x in ('archive', 'file'):
            named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

            # mapreduce.job.cache.archives
            # mapreduce.job.cache.files
            j['mapreduce.job.cache.%ss' % x] = ','.join(
                '%s#%s' % (path, name) for name, path in named_paths)

            # mapreduce.job.cache.local.archives
            # mapreduce.job.cache.local.files
            j['mapreduce.job.cache.local.%ss' % x] = ','.join(
                join(working_dir, name) for name, path in named_paths)

        if map_split:
            # mapreduce.map.input.file
            # mapreduce.map.input.start
            # mapreduce.map.input.length
            for key, value in map_split.items():
                j['mapreduce.map.input.' + key] = str(value)

        # translate to correct version

        # don't use translate_jobconf_dict(); that's meant to add keys
        # to user-supplied jobconf
        hadoop_version = self.get_hadoop_version()

        if hadoop_version:
            return {translate_jobconf(k, hadoop_version): v
                    for k, v in j.items()}
        else:
            return {tk: v for k, v in j.items()
                    for tk in translate_jobconf_for_all_versions(k)}
Пример #3
0
    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        """
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
            else:
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf
Пример #4
0
    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        """
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
            else:
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf
Пример #5
0
    def _simulate_jobconf_for_step(self,
                                   step_num,
                                   step_type,
                                   task_num,
                                   working_dir,
                                   input_file=None,
                                   input_start=None,
                                   input_length=None):
        """Simulate jobconf variables set by Hadoop to indicate input
        files, files uploaded, working directory, etc. for a particular step.

        Returns a dictionary mapping jobconf variable name
        (e.g. ``'mapreduce.map.input.file'``) to its value, which is always
        a string.
        """
        # By convention, we use the newer (Hadoop 2) jobconf names and
        # translate them at the very end.
        j = {}

        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.output.dir'] = self._output_dir
        j['mapreduce.job.local.dir'] = working_dir
        # archives and files for jobconf
        cache_archives = []
        cache_files = []
        cache_local_archives = []
        cache_local_files = []

        files = self._working_dir_mgr.name_to_path('file').items()
        for name, path in files:
            cache_files.append('%s#%s' % (path, name))
            cache_local_files.append(os.path.join(working_dir, name))

        archives = self._working_dir_mgr.name_to_path('archive').items()
        for name, path in archives:
            cache_archives.append('%s#%s' % (path, name))
            cache_local_archives.append(os.path.join(working_dir, name))

        # TODO: could add mtime info here too (e.g.
        # mapreduce.job.cache.archives.timestamps) here too
        j['mapreduce.job.cache.files'] = (','.join(cache_files))
        j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
        j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
        j['mapreduce.job.cache.local.archives'] = (
            ','.join(cache_local_archives))

        # task and attempt IDs
        # TODO: these are a crappy imitation of task/attempt IDs (see #1254)
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, step_type.lower(), step_num, task_num)
        # (we only have one attempt)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, step_type.lower(), step_num, task_num)

        # not actually sure what's correct for combiners here. It'll definitely
        # be true if we're just using pipes to simulate a combiner though
        j['mapreduce.task.ismap'] = str(step_type in ('mapper',
                                                      'combiner')).lower()

        j['mapreduce.task.partition'] = str(task_num)

        if input_file is not None:
            j['mapreduce.map.input.file'] = input_file
        if input_start is not None:
            j['mapreduce.map.input.start'] = str(input_start)
        if input_length is not None:
            j['mapreduce.map.input.length'] = str(input_length)

        version = self.get_hadoop_version()
        if version:
            # translate to correct version
            j = dict((translate_jobconf(k, version), v) for k, v in j.items())
        else:
            # use all versions
            j = dict((variant, v) for k, v in j.items()
                     for variant in translate_jobconf_for_all_versions(k))

        return j
Пример #6
0
 def test_translate_jobconf_for_all_versions(self):
     self.assertEqual(translate_jobconf_for_all_versions('user.name'),
                      ['mapreduce.job.user.name', 'user.name'])
     self.assertEqual(translate_jobconf_for_all_versions('foo.bar'),
                      ['foo.bar'])
Пример #7
0
 def test_translate_jobconf_for_all_versions(self):
     self.assertEqual(translate_jobconf_for_all_versions('user.name'),
                      ['mapreduce.job.user.name', 'user.name'])
     self.assertEqual(translate_jobconf_for_all_versions('foo.bar'),
                      ['foo.bar'])
Пример #8
0
    def _simulate_jobconf_for_step(
            self, step_num, step_type, task_num, working_dir,
            input_file=None, input_start=None, input_length=None):
        """Simulate jobconf variables set by Hadoop to indicate input
        files, files uploaded, working directory, etc. for a particular step.

        Returns a dictionary mapping jobconf variable name
        (e.g. ``'mapreduce.map.input.file'``) to its value, which is always
        a string.
        """
        # By convention, we use the newer (Hadoop 2) jobconf names and
        # translate them at the very end.
        j = {}

        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.output.dir'] = self._output_dir
        j['mapreduce.job.local.dir'] = working_dir
        # archives and files for jobconf
        cache_archives = []
        cache_files = []
        cache_local_archives = []
        cache_local_files = []

        files = self._working_dir_mgr.name_to_path('file').items()
        for name, path in files:
            cache_files.append('%s#%s' % (path, name))
            cache_local_files.append(os.path.join(working_dir, name))

        archives = self._working_dir_mgr.name_to_path('archive').items()
        for name, path in archives:
            cache_archives.append('%s#%s' % (path, name))
            cache_local_archives.append(os.path.join(working_dir, name))

        # TODO: could add mtime info here too (e.g.
        # mapreduce.job.cache.archives.timestamps) here too
        j['mapreduce.job.cache.files'] = (','.join(cache_files))
        j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
        j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
        j['mapreduce.job.cache.local.archives'] = (
            ','.join(cache_local_archives))

        # task and attempt IDs
        j['mapreduce.task.id'] = 'task_%s_%s_%05d%d' % (
            self._job_key, step_type.lower(), step_num, task_num)
        # (we only have one attempt)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%05d%d_0' % (
            self._job_key, step_type.lower(), step_num, task_num)

        # not actually sure what's correct for combiners here. It'll definitely
        # be true if we're just using pipes to simulate a combiner though
        j['mapreduce.task.ismap'] = str(
            step_type in ('mapper', 'combiner')).lower()

        j['mapreduce.task.partition'] = str(task_num)

        if input_file is not None:
            j['mapreduce.map.input.file'] = input_file
        if input_start is not None:
            j['mapreduce.map.input.start'] = str(input_start)
        if input_length is not None:
            j['mapreduce.map.input.length'] = str(input_length)

        version = self.get_hadoop_version()
        if version:
            # translate to correct version
            j = dict((translate_jobconf(k, version), v) for k, v in j.items())
        else:
            # use all versions
            j = dict((variant, v)
                     for k, v in j.items()
                     for variant in translate_jobconf_for_all_versions(k))

        return j
Пример #9
0
 def test_translate_jobconf_for_all_versions(self):
     self.assertEqual(translate_jobconf_for_all_versions("user.name"), ["mapreduce.job.user.name", "user.name"])
     self.assertEqual(translate_jobconf_for_all_versions("foo.bar"), ["foo.bar"])