def run_class(class_name, args=None, properties=None, classpath=None,
              hadoop_conf_dir=None, logger=None):
    """
    Run a class that needs the Hadoop jars in its class path.

    ``args`` and ``properties`` are passed to :func:`run_cmd`.

    .. code-block:: python

      >>> cls = 'org.apache.hadoop.hdfs.tools.DFSAdmin'
      >>> print run_class(cls, args=['-help', 'report'])
      -report: Reports basic filesystem information and statistics.
    """
    if logger is None:
        logger = utils.NullLogger()
    old_classpath = None
    if classpath:
        old_classpath = os.getenv('HADOOP_CLASSPATH', '')
        if isinstance(classpath, basestring):
            classpath = [classpath]
        classpath_list = [cp.strip() for s in classpath for cp in s.split(":")]
        os.environ['HADOOP_CLASSPATH'] = ":".join(classpath_list)
        logger.debug('HADOOP_CLASSPATH %s', os.environ['HADOOP_CLASSPATH'])
    res = run_cmd(class_name, args, properties,
                  hadoop_conf_dir=hadoop_conf_dir, logger=logger)
    if old_classpath is not None:
        os.environ['HADOOP_CLASSPATH'] = old_classpath
    return res
def __init__(self, prefix=None, logger=None):
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the stdout
    and stderr of the command will be buffered in memory.  If the
    command succeeds, the former will be returned; if it fails, a
    ``RunCmdError`` will be raised with the latter as the message.
    This mode is appropriate for short-running commands whose "result"
    is represented by their standard output (e.g., ``"dfsadmin",
    ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the
    return value will be empty.  This mode is appropriate for
    long-running commands that do not write their "real" output to
    stdout (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args, ))
    if keep_streams:
        p = subprocess.Popen(_args, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
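
# Usage sketch for the two ``keep_streams`` modes.  This assumes a reachable
# Hadoop installation and that the helper above is importable as
# pydoop.hadut.run_cmd (the import path is an assumption of this example,
# not stated in the code above).
from pydoop.hadut import run_cmd

# Default mode: stdout/stderr are buffered; stdout is returned on success.
print(run_cmd("dfsadmin", ["-safemode", "get"]))

# keep_streams=False: the command writes straight to this process's
# stdout/stderr and the return value is empty -- better suited to
# long-running commands whose useful output does not go to stdout.
run_cmd("fs", ["-count", "/user"], keep_streams=False)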
def run_class(class_name, args=None, properties=None, classpath=None,
              hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Java class with Hadoop (equivalent of running ``hadoop
    <class_name>`` from the command line).

    Additional ``HADOOP_CLASSPATH`` elements can be provided via
    ``classpath`` (either as a non-string sequence where each element
    is a classpath element or as a ``':'``-separated string).  Other
    arguments are passed to :func:`run_cmd`.

    .. code-block:: python

      >>> cls = 'org.apache.hadoop.fs.FsShell'
      >>> try: out = run_class(cls, args=['-test', '-e', 'file:/tmp'])
      ... except RunCmdError: tmp_exists = False
      ... else: tmp_exists = True

    .. note::

      ``HADOOP_CLASSPATH`` makes dependencies available **only on the
      client side**.  If you are running a MapReduce application, use
      ``args=['-libjars', 'jar1,jar2,...']`` to make them available to
      the server side as well.
    """
    if logger is None:
        logger = utils.NullLogger()
    old_classpath = None
    if classpath:
        old_classpath = os.getenv('HADOOP_CLASSPATH', '')
        if isinstance(classpath, basestring):
            classpath = [classpath]
        # Prepend the classpaths provided by the user to the existing
        # HADOOP_CLASSPATH value.  Order matters.  We could work a little
        # harder to avoid duplicates, but it's not essential
        os.environ['HADOOP_CLASSPATH'] = ":".join(
            classpath + old_classpath.split(':', 1)
        )
        logger.debug('HADOOP_CLASSPATH: %r', os.getenv('HADOOP_CLASSPATH'))
    try:
        res = run_cmd(class_name, args, properties,
                      hadoop_conf_dir=hadoop_conf_dir, logger=logger,
                      keep_streams=keep_streams)
    finally:
        if old_classpath is not None:
            os.environ['HADOOP_CLASSPATH'] = old_classpath
    return res
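
# Sketch: run a Java tool whose jar is not on the default classpath.  The
# class name and jar path below are placeholders; run_class and RunCmdError
# are assumed importable from this module (e.g., pydoop.hadut).
from pydoop.hadut import run_class, RunCmdError

try:
    out = run_class(
        "org.example.MyTool",                # hypothetical main class
        args=["-verbose"],
        classpath="/opt/jars/my-extra.jar",  # prepended to HADOOP_CLASSPATH
    )
except RunCmdError as e:
    print("tool failed: %s" % e)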
def __init__(self, prefix=None, logger=None):
    # this helper does not support Hadoop's local (standalone) mode
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        # set up a randomly named HDFS working dir with input/output subpaths
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
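
# Sketch of how a runner built on this constructor could be used.  The class
# name "PipesRunner" is an assumption of this example; only the attributes
# set by __init__ above (wd, input, output) are relied upon.
from pydoop import hdfs

runner = PipesRunner(prefix="pydoop_test_")
print(runner.wd)      # random HDFS working directory
print(runner.input)   # <wd>/input
print(runner.output)  # <wd>/output
# ... upload data under runner.input, run the job, read runner.output ...
hdfs.rmr(runner.wd)   # clean up the working directory when done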
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None):
    """
    Run a Hadoop command.

    If the command succeeds, return its output; if it fails, raise a
    ``RunCmdError`` with its error output as the message.

    .. code-block:: python

      >>> import uuid
      >>> properties = {'dfs.block.size': 32*2**20}
      >>> args = ['-put', 'hadut.py', uuid.uuid4().hex]
      >>> res = run_cmd('fs', args, properties)
      >>> res
      ''
      >>> print run_cmd('dfsadmin', ['-help', 'report'])
      -report: Reports basic filesystem information and statistics.
      >>> try:
      ...     run_cmd('foo')
      ... except RunCmdError as e:
      ...     print e
      ...
      Exception in thread "main" java.lang.NoClassDefFoundError: foo
      ...
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args, ))
    p = subprocess.Popen(_args, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and
    ``hadoop.pipes.java.recordwriter`` to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument ``force_pydoop_submitter=True``.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable,
        "-input", input_path,
        "-output", output_path,
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties,
                         classpath=pydoop_jar, logger=logger)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger)
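
# Sketch of a pipes job submission.  All paths are illustrative: the pipes
# executable and the input data are assumed to already be on HDFS, and the
# output directory must not exist yet.  run_pipes is assumed importable from
# this module (e.g., pydoop.hadut).
from pydoop.hadut import run_pipes

run_pipes(
    "/user/me/bin/wordcount-pipes",   # compiled pipes executable on HDFS
    "/user/me/wordcount/input",       # input directory
    "/user/me/wordcount/output",      # output directory, created by the job
    properties={"mapred.reduce.tasks": "2"},
)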