def expand_paths(datapath_uri):
    """
    If a URI contains wildcards, this function expands them.

    Returns a list of URIs.
    """
    # simple case: the path simply exists
    if phdfs.path.exists(datapath_uri.geturl()):
        return [datapath_uri.geturl()]

    # second case: the path doesn't exist as it is.  It may contain wildcards,
    # so we try listing the datapath with hadoop dfs.  If we were to list with
    # pydoop.hdfs.ls we'd have to implement hadoop wildcards ourselves
    # (perhaps with fnmatch)
    def process(ls_line):
        path = ls_line[(ls_line.rindex(' ') + 1):]
        url = Uri(urlparse.urlparse(path))
        url.scheme = datapath_uri.scheme
        url.netloc = datapath_uri.netloc
        return url.geturl()
    try:
        # run -ls with hadoop dfs, then process the output.
        # We drop the first line since it's something like "Found xx items".
        ls_output = subprocess.check_output(
            [pydoop.hadoop_exec(), 'dfs', '-ls', datapath_uri.geturl()]
        ).rstrip('\n').split('\n')[1:]
        # apply the 'process' function to each data line to turn it into a full URI
        return map(process, ls_output)
    except subprocess.CalledProcessError as e:
        print_err("Could not list datapath %s. Please check whether it exists" % datapath_uri.geturl())
        print_err("Message:", str(e))
        sys.exit(1)
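# Usage sketch (hypothetical URI): expand_paths() takes a parsed URI object
# exposing scheme/netloc attributes and geturl(), like the Uri wrapper used
# above.  The namenode address and glob below are made up for illustration.
#
#   wildcard_uri = Uri(urlparse.urlparse("hdfs://namenode:8020/user/data/part-*"))
#   for concrete_url in expand_paths(wildcard_uri):
#       print concrete_url  # one fully qualified URI per matching path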
def run_hadoop_jar(jar, class_name=None, additional_cp=None, properties=None, args_list=[]):
    """
    Run a jar with "hadoop jar", optionally specifying the main class.
    """
    if not os.path.exists(jar) or not os.access(jar, os.R_OK):
        raise ValueError("Can't read jar file %s" % jar)
    args = [pydoop.hadoop_exec(), 'jar', jar]
    if class_name:
        args.append(class_name)
    if additional_cp:
        env = copy.copy(os.environ)
        if type(additional_cp) == str:
            # wrap a single class path in a list
            additional_cp = [additional_cp]
        # Pass this classpath string to hadoop through the HADOOP_CLASSPATH
        # environment variable.  If HADOOP_CLASSPATH is already defined, we'll
        # append our values to it.
        if env.has_key('HADOOP_CLASSPATH'):
            additional_cp.insert(0, env['HADOOP_CLASSPATH'])
        env['HADOOP_CLASSPATH'] = ":".join(additional_cp)
    else:
        env = os.environ
    if properties:
        args.extend(__construct_property_args(properties))
    args.extend(args_list)
    return subprocess.call(args, env=env)
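# Usage sketch (hypothetical jar, class and paths): run_hadoop_jar() above
# builds a "hadoop jar" command line; extra classpath entries are injected
# through HADOOP_CLASSPATH.  All file names and the main class are placeholders.
#
#   retcode = run_hadoop_jar(
#       "/opt/myapp/myapp.jar",               # jar must exist and be readable
#       class_name="org.example.MyMain",      # optional main class
#       additional_cp=["/opt/myapp/lib/extra.jar"],
#       properties={"mapred.reduce.tasks": "4"},
#       args_list=["input_dir", "output_dir"])
#   if retcode != 0:
#       print >>sys.stderr, "hadoop jar exited with", retcode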
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the stdout and
    stderr of the command will be buffered in memory.  If the command
    succeeds, the former will be returned; if it fails, a ``RunCmdError``
    will be raised with the latter as the message.  This mode is appropriate
    for short-running commands whose "result" is represented by their
    standard output (e.g., ``"dfsadmin", ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the return
    value will be empty.  This mode is appropriate for long running commands
    that do not write their "real" output to stdout (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args,))
    if keep_streams:
        p = subprocess.Popen(
            _args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        error = ""
        stderr_iterator = iter(p.stderr.readline, b"")
        for line in stderr_iterator:
            error += line
            logger.info("cmd stderr line: " + line.strip())
        output, _ = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
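# Usage sketch: the two keep_streams modes described in the docstring above.
# The dfsadmin example is taken from the docstring; the second call merely
# illustrates streaming output straight to the caller's terminal.
#
#   safemode = run_cmd("dfsadmin", ["-safemode", "get"])   # buffered: stdout returned
#   run_cmd("version", keep_streams=False)                 # streamed: prints directly, returns ''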
def main(class_name, app_name, args):
    print >>sys.stderr, "Using hadoop executable", pydoop.hadoop_exec()
    print >>sys.stderr, "Using seal jar", seal.jar_path()
    retcode = seal_utilities.run_hadoop_jar(seal.jar_path(), class_name, args_list=args)
    if retcode != 0 and retcode != 3:  # 3 for usage error
        print >>sys.stderr, "Error running", app_name
    return retcode
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the stdout and
    stderr of the command will be buffered in memory.  If the command
    succeeds, the former will be returned; if it fails, a ``RunCmdError``
    will be raised with the latter as the message.  This mode is appropriate
    for short-running commands whose "result" is represented by their
    standard output (e.g., ``"dfsadmin", ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the return
    value will be empty.  This mode is appropriate for long running commands
    that do not write their "real" output to stdout (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args,))
    if keep_streams:
        p = subprocess.Popen(_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
def run_pipes(executable, input_path, output_path, properties=None, args_list=[]):
    """
    Run a Pipes application with "hadoop pipes", registering ``executable``
    through the ``hadoop.pipes.executable`` property.
    """
    args = [pydoop.hadoop_exec(), "pipes"]
    properties = properties.copy() if properties else {}
    properties['hadoop.pipes.executable'] = executable
    args.extend(__construct_property_args(properties))
    args.extend(["-input", input_path, "-output", output_path])
    args.extend(args_list)
    return subprocess.call(args)
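# Usage sketch (hypothetical HDFS paths): run_pipes() above wraps
# "hadoop pipes"; the executable path, input and output directories below
# are placeholders.
#
#   ret = run_pipes(
#       "hdfs:///user/me/bin/wordcount",      # pipes executable already on HDFS
#       "/user/me/input",
#       "/user/me/output",
#       properties={"hadoop.pipes.java.recordreader": "true",
#                   "hadoop.pipes.java.recordwriter": "true"})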
def test_hadoop():
    """
    Test the hadoop configuration.  Calls sys.exit if the test fails.
    """
    cmd = [pydoop.hadoop_exec(), 'dfs', '-stat', 'file:///']
    try:
        subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e:
        print_err("Error running hadoop program. Please check your environment (tried %s)" % ' '.join(cmd))
        print_err("Message:", str(e))
        sys.exit(2)
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None):
    """
    Run a Hadoop command.

    If the command succeeds, return its output; if it fails, raise a
    ``RunCmdError`` with its error output as the message.

    .. code-block:: python

      >>> import uuid
      >>> properties = {'dfs.block.size': 32*2**20}
      >>> args = ['-put', 'hadut.py', uuid.uuid4().hex]
      >>> res = run_cmd('fs', args, properties)
      >>> res
      ''
      >>> print run_cmd('dfsadmin', ['-help', 'report'])
      -report: Reports basic filesystem information and statistics.
      >>> try:
      ...     run_cmd('foo')
      ... except RunCmdError as e:
      ...     print e
      ...
      Exception in thread "main" java.lang.NoClassDefFoundError: foo
      ...
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.info('args %s, cmd %s, properties %s', _args, cmd, properties)
    p = subprocess.Popen(_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Hadoop command (thin wrapper around ``run_tool_cmd``).
    """
    tool = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    return run_tool_cmd(tool, cmd, args=args, properties=properties,
                        hadoop_conf_dir=hadoop_conf_dir, logger=logger,
                        keep_streams=keep_streams)
def perform_distcp(copy_groups):
    cmd_start = [pydoop.hadoop_exec(), 'distcp2', '-atomic']
    try:
        for output_path, src_paths in copy_groups.iteritems():
            cmd = cmd_start[:]
            cmd.extend(src_paths)
            cmd.append(output_path)
            log.debug("%s", cmd)
            subprocess.check_call(cmd)
            # Hadoop distcp2 doesn't seem to correctly report errors through its
            # exit code.  For instance, it exits with 0 even when the job is killed.
            # To verify its success we check that the destination directory exists.
            # Since we're using -atomic it should only exist if everything went well.
            if phdfs.path.exists(output_path):
                log.info("Successfully ran distcp")
            else:
                raise RuntimeError("Distcp2 failed to complete. Output path not created: %s" % output_path)
    except (subprocess.CalledProcessError, RuntimeError) as e:
        log.critical("Error running distcp: %s", e.message)
        raise e
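# Usage sketch (hypothetical cluster paths): perform_distcp() above expects a
# dict mapping each destination directory to the list of source paths to copy
# into it.  The namenode addresses and directories below are made up.
#
#   copy_groups = {
#       "hdfs://dest-nn:8020/archive/run_001": [
#           "hdfs://src-nn:8020/data/run_001/lane1",
#           "hdfs://src-nn:8020/data/run_001/lane2",
#       ],
#   }
#   perform_distcp(copy_groups)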
The purpose of this example is to demonstrate the usage of
SequenceFileInputFormat and SequenceFileOutputFormat.
"""

import os
import optparse
import logging
logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.test_support as pts
import pydoop.hadut as hadut

HADOOP = pydoop.hadoop_exec()
HADOOP_CONF_DIR = pydoop.hadoop_conf()
OUTPUT = "output"
LOCAL_WC_SCRIPT = "bin/wordcount.py"
LOCAL_FILTER_SCRIPT = "bin/filter.py"
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_INPUT = os.path.normpath(os.path.join(THIS_DIR, "../input"))

MR_JOB_NAME = "mapred.job.name"
MR_HOME_DIR = 'mapreduce.admin.user.home.dir'
PIPES_JAVA_RR = "hadoop.pipes.java.recordreader"
PIPES_JAVA_RW = "hadoop.pipes.java.recordwriter"
MR_OUT_COMPRESS_TYPE = "mapred.output.compression.type"
MR_REDUCE_TASKS = "mapred.reduce.tasks"
MR_IN_CLASS = "mapred.input.format.class"
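# Illustrative sketch only: one way these property-name constants might be
# assembled into a job configuration dict for a pipes run.  The job name,
# reducer count and record reader/writer settings are assumptions; the
# example's actual driver code is not shown here.
#
#   BASE_MR_OPTIONS = {
#       MR_JOB_NAME: "sequence_file_wordcount",
#       PIPES_JAVA_RR: "false",
#       PIPES_JAVA_RW: "false",
#       MR_REDUCE_TASKS: "2",
#   }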