def set_args(self, args):
    """
    Configure the pydoop script run, based on the arguments provided.
    """
    self.logger.setLevel(getattr(logging, args.log_level))
    parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
    self.remote_wd = hdfs.path.join(
        parent, utils.make_random_str(prefix="pydoop_script_")
    )
    self.remote_exe = hdfs.path.join(
        self.remote_wd, utils.make_random_str(prefix="exe")
    )
    module_bn = os.path.basename(args.module)
    self.remote_module_bn = utils.make_random_str(
        prefix="pydoop_script_", postfix=".py"
    )
    self.remote_module = hdfs.path.join(self.remote_wd, self.remote_module_bn)
    dist_cache_parameter = "%s#%s" % (self.remote_module, self.remote_module_bn)
    self.properties['mapred.job.name'] = module_bn
    self.properties.update(dict(args.D or []))
    self.properties['mapred.reduce.tasks'] = args.num_reducers
    self.properties['mapred.textoutputformat.separator'] = args.kv_separator
    if self.properties['mapred.cache.files']:
        self.properties['mapred.cache.files'] += ','
    self.properties['mapred.cache.files'] += dist_cache_parameter
    self.args = args
def set_args(self, args):
    """
    Configure the pydoop script run, based on the arguments provided.
    """
    self.logger.setLevel(getattr(logging, args.log_level))
    parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
    self.remote_wd = hdfs.path.join(
        parent, utils.make_random_str(prefix="pydoop_script_")
    )
    self.remote_exe = hdfs.path.join(
        self.remote_wd, utils.make_random_str(prefix="exe")
    )
    module_bn = os.path.basename(args.module)
    _, ext = os.path.splitext(module_bn)
    # If the module doesn't have an extension, assume it should be .py.
    # This could happen, for instance, if someone loads an executable module
    # as a script. We can't blindly add .py, though, since the module may be
    # a .pyc file.
    if not ext:
        ext = '.py'
    self.remote_module_bn = utils.make_random_str(
        prefix="pydoop_script_", postfix=ext
    )
    self.remote_module = hdfs.path.join(self.remote_wd, self.remote_module_bn)
    dist_cache_parameter = "%s#%s" % (self.remote_module, self.remote_module_bn)
    self.properties['mapred.job.name'] = module_bn
    self.properties.update(dict(args.D or []))
    self.properties['mapred.reduce.tasks'] = args.num_reducers
    self.properties['mapred.textoutputformat.separator'] = args.kv_separator
    if self.properties['mapred.cache.files']:
        self.properties['mapred.cache.files'] += ','
    self.properties['mapred.cache.files'] += dist_cache_parameter
    self.args = args
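# Quick, self-contained illustration (not from the original code) of the
# extension handling in set_args above: only a missing extension is replaced
# with .py; existing extensions such as .pyc are preserved.
import os

for name in ("wc.py", "wc.pyc", "wc"):
    _, ext = os.path.splitext(name)
    print(name, "->", ext if ext else ".py")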
def convert_args(self, args, unknown_args):
    # Create a zip archive containing all we need to run the script
    # (including the script itself). We use NamedTemporaryFile, which will
    # take care of deleting the temp archive once we're done.
    self.script_archive = NamedTemporaryFile(
        prefix="pydoop_script_", suffix='.zip'
    )
    zip_filename = self.script_archive.name
    # Create a one-off temporary file name to avoid name clashes in the
    # distcache. Keep the same module extension -- it may be a source file
    # or a byte-compiled file.
    mr_module = utils.make_random_str(
        prefix="pydoop_script_module_", postfix=os.path.basename(args.module)
    )
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module)
        zipf.writestr(
            mr_driver + '.py',
            self.generate_driver(os.path.splitext(mr_module)[0], args)
        )
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    args.keep_wd = False
    args.pstats_dir = None
    args.pstats_fmt = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get(
        'mapred.output.format.class', DEFAULT_OUTPUT_FORMAT
    )
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(
                    "Can't find pydoop.jar, output will "
                    "probably be tab-separated"
                )
    self.args, self.unknown_args = args, unknown_args
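# A minimal, self-contained illustration (assumed file names, not from the
# original code) of the ZipFile usage in convert_args above: the user's module
# is stored under a randomized arcname next to a generated driver module.
import os
from tempfile import NamedTemporaryFile, mkdtemp
from zipfile import ZipFile

tmp_dir = mkdtemp()
module_path = os.path.join(tmp_dir, "wordcount.py")  # stand-in for args.module
with open(module_path, "w") as f:
    f.write("def mapper(_, v, writer):\n    writer.emit(v, 1)\n")

archive = NamedTemporaryFile(prefix="pydoop_script_", suffix=".zip")
with ZipFile(archive.name, "w") as zipf:
    zipf.write(module_path, arcname="pydoop_script_module_x7.py")
    zipf.writestr("pydoop_script_driver_x7.py", "def main():\n    pass\n")
with ZipFile(archive.name) as zipf:
    print(zipf.namelist())  # both entries live at the archive root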
def convert_args(self, args, unknown_args):
    zip_filename = utils.make_random_str(
        prefix="pydoop_script_", postfix='.zip'
    )
    mr_module = utils.make_random_str(prefix="pydoop_script_module_")
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module + '.py')
        zipf.writestr(mr_driver + '.py', self.generate_driver(mr_module, args))
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.input_format = None
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
    args.local_fs = False
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get(
        'mapred.output.format.class', DEFAULT_OUTPUT_FORMAT
    )
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(
                    "Can't find pydoop.jar, output will "
                    "probably be tab-separated"
                )
    self.args, self.unknown_args = args, unknown_args
    self.zip_filename = zip_filename
def set_args(self, args, unknown_args=None):
    """
    Configure job, based on the arguments provided.
    """
    if unknown_args is None:
        unknown_args = []
    self.logger.setLevel(getattr(logging, args.log_level))
    parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
    self.remote_wd = hdfs.path.join(
        parent, utils.make_random_str(prefix="pydoop_submit_")
    )
    self.remote_exe = hdfs.path.join(self.remote_wd, str(uuid.uuid4()))
    self.properties[JOB_NAME] = args.job_name or 'pydoop'
    self.properties[IS_JAVA_RR] = (
        'false' if args.do_not_use_java_record_reader else 'true'
    )
    self.properties[IS_JAVA_RW] = (
        'false' if args.do_not_use_java_record_writer else 'true'
    )
    self.properties[JOB_REDUCES] = args.num_reducers
    if args.job_name:
        self.properties[JOB_NAME] = args.job_name
    self.properties.update(dict(args.D or []))
    self.properties.update(dict(args.job_conf or []))
    self.__set_files_to_cache(args)
    self.__set_archives_to_cache(args)
    self.requested_env = self._env_arg_to_dict(args.set_env or [])
    self.args = args
    self.unknown_args = unknown_args
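# A minimal sketch (assumed argparse wiring, not from the original source) of
# the argument shape set_args above relies on: args.D and args.job_conf must be
# lists of (key, value) pairs so that dict(args.D or []) works.
import argparse

def _kv(s):
    k, _, v = s.partition("=")
    return k, v

parser = argparse.ArgumentParser()
parser.add_argument("-D", action="append", type=_kv)
args = parser.parse_args(["-D", "mapred.reduce.tasks=4"])
print(dict(args.D or []))  # {'mapred.reduce.tasks': '4'}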
def set_args(self, args, unknown_args=None):
    """
    Configure job, based on the arguments provided.
    """
    if unknown_args is None:
        unknown_args = []
    self.logger.setLevel(getattr(logging, args.log_level))
    parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
    self.remote_wd = hdfs.path.join(
        parent, utils.make_random_str(prefix="pydoop_submit_")
    )
    self.remote_exe = args.program
    self.properties[JOB_NAME] = args.job_name or 'pydoop'
    self.properties[IS_JAVA_RR] = (
        'false' if args.do_not_use_java_record_reader else 'true'
    )
    self.properties[IS_JAVA_RW] = (
        'false' if args.do_not_use_java_record_writer else 'true'
    )
    if args.input_format:
        self.properties[
            INPUT_FORMAT_MRV2 if args.mrv2 else INPUT_FORMAT_MRV1
        ] = args.input_format
    if args.output_format:
        self.properties[
            OUTPUT_FORMAT_MRV2 if args.mrv2 else OUTPUT_FORMAT_MRV1
        ] = args.output_format
    self.properties[JOB_REDUCES] = args.num_reducers
    if args.job_name:
        self.properties[JOB_NAME] = args.job_name
    self.properties.update(dict(args.D or []))
    self.properties.update(dict(args.job_conf or []))
    self.__set_files_to_cache(args)
    self.__set_archives_to_cache(args)
    self.args = args
    self.unknown_args = unknown_args
def __init__(self, prefix=None, logger=None):
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def set_exe(self, pipes_code):
    """
    Dump launcher code to the distributed file system.
    """
    if not self.output:
        raise RuntimeError("no output directory, can't create launcher")
    parent = hdfs.path.dirname(hdfs.path.abspath(self.output.rstrip("/")))
    self.exe = hdfs.path.join(parent, utils.make_random_str())
    hdfs.dump(pipes_code, self.exe)
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
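# A minimal sketch (not part of the original example) of exercising the mapper
# above without a Hadoop run, using a trivial stand-in for the pydoop writer.
# get_array and calc_features are assumed to be defined elsewhere in the script.
class FakeWriter(object):
    def __init__(self):
        self.emitted = []

    def emit(self, key, value):
        self.emitted.append((key, value))

# Hypothetical usage (paths are illustrative):
# writer = FakeWriter()
# mapper(None, "/user/me/images/img_0001.png", writer,
#        {"out.dir": "/user/me/features"})
# print(writer.emitted)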
def test_isdir(self):
    path = utils.make_random_str()
    self.assertFalse(hdfs.path.isdir(path))
    try:
        hdfs.dump("foo\n", path)
        self.assertFalse(hdfs.path.isdir(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertTrue(hdfs.path.isdir(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def test_kind(self):
    path = utils.make_random_str()
    self.assertTrue(hdfs.path.kind(path) is None)
    try:
        hdfs.dump("foo\n", path)
        self.assertEqual('file', hdfs.path.kind(path))
        hdfs.rmr(path)
        hdfs.mkdir(path)
        self.assertEqual('directory', hdfs.path.kind(path))
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass
def good(self):
    path = utils.make_random_str()
    hdfs.dump("foo\n", path)
    self.assertTrue(hdfs.path.exists(path))
    hdfs.rmr(path)
    self.assertFalse(hdfs.path.exists(path))
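# The tests above repeat the same create/remove bookkeeping on a random HDFS
# path; a sketch of factoring it into a context manager (an assumed helper,
# not part of the original test suite):
from contextlib import contextmanager

@contextmanager
def scratch_hdfs_path():
    path = utils.make_random_str()
    try:
        yield path
    finally:
        try:
            hdfs.rmr(path)
        except IOError:
            pass

# Hypothetical usage inside a test:
# with scratch_hdfs_path() as path:
#     hdfs.dump("foo\n", path)
#     self.assertTrue(hdfs.path.exists(path))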