def _transform_output_format(self, pcollection, output_format):
    """Rewrite the sink path of *output_format* for MR execution.

    Infinite PTypes must target HDFS. Finite PTypes aimed at a local
    path are redirected to a temporary HDFS location whose contents are
    dumped back to the local FS once the job completes; finite HDFS
    targets get a temp write path plus a commit path.

    Returns the (mutated) *output_format*.
    """
    from bigflow.util import path_util
    from bigflow.util import utils

    format_type = output_format.get_entity_name()
    # todo: extract ugi from output_format, support multiple clusters and ugis
    if format_type not in ("TextOutputFormat",
                           "SequenceFileAsBinaryOutputFormat"):
        return output_format

    uri = path_util.to_abs_local_path(output_format.path)
    on_hdfs = path_util.is_hdfs_path(uri)

    if utils.is_infinite(pcollection):
        if not on_hdfs:
            raise ValueError("That write infinite PType to local file "
                             "is not supported in MRPipeline")
        output_format.path = self._toft_path(uri)
    elif on_hdfs:
        # Finite HDFS output: write to a temp location, commit to the
        # final uri when the job succeeds.
        output_format.path = self._toft_path(self._tmp_output_path(uri))
        output_format.commit_path = self._toft_path(uri)
    else:
        # User try to use MRPipeline to write local file, we replace original uri
        # to a temp path on HDFS and dump the output for local FS after job is done.
        hdfs_uri = self._tmp_hdfs_path(uri)
        output_format.path = self._toft_path(hdfs_uri)
        self._local_uri_infos.append({
            'local_uri': uri,
            'hdfs_uri': hdfs_uri,
            'overwrite': output_format.overwrite
        })
        logger.debug(
            "Write file to HDFS path: %s and dump it after job done" % hdfs_uri)
        self._remote_temp_files.append(hdfs_uri)
    return output_format
def _transform_uri(self, uri, format_type, ugi=None):
    """Normalize *uri* for the given input *format_type*.

    Plain and stream formats become absolute toft-style paths; when the
    result is not a toft-style DFS path, it is expanded as a local glob
    (returning a list).  Ugi-aware formats carry *ugi* into the toft
    path.  Any other format is returned unchanged.
    """
    import glob
    from bigflow.util import path_util

    plain_formats = (
        "TextInputFormat",
        "SequenceFileAsBinaryInputFormat",
        "OrcInputFormat",
        "ParquetInputFormat",
        "TextStreamInputFormat",
        "SequenceStreamInputFormat",
    )
    ugi_formats = (
        "TextInputFormatWithUgi",
        "SequenceStreamInputFormatWithUgi",
    )

    if format_type in plain_formats:
        toft_uri = self._toft_path(path_util.to_abs_local_path(uri))
        if path_util.is_toft_style_dfs_path(toft_uri):
            return toft_uri
        # support local file glob
        return glob.glob(toft_uri)

    if format_type in ugi_formats:
        return self._toft_path(path_util.to_abs_local_path(uri), ugi)

    return uri
def _transform_uri(self, uri, format_type, ugi=None):
    """Resolve *uri* for reading.

    For the handled input formats, an HDFS path is returned as-is
    (made absolute), while a local path is uploaded first and the
    uploaded location returned.  Other formats pass through unchanged.
    """
    from bigflow.util import path_util

    if format_type in ("TextInputFormat", "SequenceFileAsBinaryInputFormat"):
        abs_uri = path_util.to_abs_local_path(uri)
        if path_util.is_hdfs_path(abs_uri):
            return abs_uri
        return self._upload_file(abs_uri)
    # todo: support multiple clusters and ugis for spark_pipeline
    return uri
def _transform_uri(self, uri, format_type, ugi=None):
    """Normalize *uri* according to *format_type*.

    Recognized plain/stream formats yield an absolute toft-style path,
    falling back to local glob expansion (a list) when the path is not
    a toft-style DFS path.  Ugi-aware formats attach *ugi* to the toft
    path.  Unrecognized formats are returned untouched.
    """
    import glob
    from bigflow.util import path_util

    if format_type in ("TextInputFormat",
                       "SequenceFileAsBinaryInputFormat",
                       "OrcInputFormat",
                       "ParquetInputFormat",
                       "TextStreamInputFormat",
                       "SequenceStreamInputFormat"):
        resolved = self._toft_path(path_util.to_abs_local_path(uri))
        if not path_util.is_toft_style_dfs_path(resolved):
            # support local file glob
            resolved = glob.glob(resolved)
        return resolved

    if format_type in ("TextInputFormatWithUgi",
                       "SequenceStreamInputFormatWithUgi"):
        return self._toft_path(path_util.to_abs_local_path(uri), ugi)

    return uri
def _transform_output_format(self, pcollection, output_format):
    """Rewrite the sink path of *output_format* as a toft-style path.

    Finite PTypes write into a temporary output path and commit to the
    final uri; infinite PTypes write directly.  An optional ``ugi``
    attribute on *output_format* is threaded through to the toft path.

    Returns the (mutated) *output_format*.
    """
    from bigflow.util import path_util
    from bigflow.util import utils

    handled_formats = (
        "TextOutputFormat",
        "SequenceFileAsBinaryOutputFormat",
        "ParquetOutputFormat",
        "PartitionedParquetOutputFormat",
    )
    format_type = output_format.get_entity_name()
    ugi = getattr(output_format, "ugi", None)

    if format_type in handled_formats:
        uri = path_util.to_abs_local_path(output_format.path)
        if utils.is_infinite(pcollection):
            output_format.path = self._toft_path(uri, ugi)
        else:
            # Finite output: stage into a temp path, commit to final uri.
            output_format.path = self._toft_path(self._tmp_output_path(uri), ugi)
            output_format.commit_path = self._toft_path(uri, ugi)
    return output_format
def _transform_output_format(self, pcollection, output_format):
    """Convert the sink path of *output_format* into a toft-style path.

    Infinite PTypes write straight to the target; finite PTypes stage
    output in a temporary path and record a commit path.  When
    *output_format* carries a ``ugi`` attribute it is applied to every
    generated toft path.

    Returns the (mutated) *output_format*.
    """
    from bigflow.util import path_util
    from bigflow.util import utils

    format_type = output_format.get_entity_name()
    ugi = getattr(output_format, "ugi", None)

    if format_type not in ("TextOutputFormat",
                           "SequenceFileAsBinaryOutputFormat",
                           "ParquetOutputFormat",
                           "PartitionedParquetOutputFormat"):
        return output_format

    uri = path_util.to_abs_local_path(output_format.path)
    if utils.is_infinite(pcollection):
        output_format.path = self._toft_path(uri, ugi)
    else:
        # Finite output is staged in a temp path and committed on success.
        output_format.path = self._toft_path(
            self._tmp_output_path(uri), ugi)
        output_format.commit_path = self._toft_path(uri, ugi)
    return output_format
def __init__(self, path, **options):
    """Build a SequenceFile target at *path* (trailing slashes stripped).

    The key/value serdes take effect only when ``key_serde`` and
    ``value_serde`` are both supplied or both omitted; supplying exactly
    one raises ``InvalidSeqSerdeException``.
    """
    super(SequenceFile, self).__init__(path, **options)
    self.output_format = _SequenceFileAsBinaryOutputFormat(
        path_util.to_abs_local_path(path.rstrip("/")), **options)
    self.options = options

    key_serde = options.get("key_serde", None)
    value_serde = options.get("value_serde", None)
    # XOR check: exactly one serde set is invalid.
    if (not key_serde) != (not value_serde):
        raise error.InvalidSeqSerdeException("key and value serde should be both set or not.")
    if key_serde is not None and value_serde is not None:
        self.kv_serializer = entity.KVSerializeFn(key_serde, value_serde)
    else:
        self.kv_serializer = None
def __init__(self, path, **options):
    """Initialize a SequenceFile sink rooted at *path*.

    ``key_serde``/``value_serde`` must be given together or not at all;
    providing only one raises ``InvalidSeqSerdeException``.  When both
    are present a KV serializer is constructed, otherwise it stays None.
    """
    super(SequenceFile, self).__init__(path, **options)
    normalized = path_util.to_abs_local_path(path.rstrip("/"))
    self.output_format = _SequenceFileAsBinaryOutputFormat(
        normalized, **options)
    self.options = options

    k_serde = options.get("key_serde", None)
    v_serde = options.get("value_serde", None)
    if (not k_serde) != (not v_serde):
        # Exactly one of the two serdes was supplied — reject.
        raise error.InvalidSeqSerdeException(
            "key and value serde should be both set or not.")
    if (k_serde is not None) and (v_serde is not None):
        self.kv_serializer = entity.KVSerializeFn(k_serde, v_serde)
    else:
        self.kv_serializer = None
def _transform_output_format(self, pcollection, output_format):
    """Adjust the sink path of *output_format* for an MR job.

    Writing an infinite PType to a non-HDFS path is rejected.  A finite
    write to a local path is redirected through a temporary HDFS path
    (dumped back locally after the job); a finite HDFS write is staged
    in a temp output path with a commit path recorded.

    Returns the (mutated) *output_format*.
    """
    from bigflow.util import path_util
    from bigflow.util import utils

    format_type = output_format.get_entity_name()
    # todo: extract ugi from output_format, support multiple clusters and ugis
    if format_type in ("TextOutputFormat",
                       "SequenceFileAsBinaryOutputFormat"):
        uri = path_util.to_abs_local_path(output_format.path)
        is_hdfs = path_util.is_hdfs_path(uri)

        if utils.is_infinite(pcollection):
            if not is_hdfs:
                raise ValueError("That write infinite PType to local file "
                                 "is not supported in MRPipeline")
            output_format.path = self._toft_path(uri)
        elif is_hdfs:
            output_format.path = self._toft_path(
                self._tmp_output_path(uri))
            output_format.commit_path = self._toft_path(uri)
        else:
            # User try to use MRPipeline to write local file, we replace original uri
            # to a temp path on HDFS and dump the output for local FS after job is done.
            hdfs_uri = self._tmp_hdfs_path(uri)
            output_format.path = self._toft_path(hdfs_uri)
            self._local_uri_infos.append({
                'local_uri': uri,
                'hdfs_uri': hdfs_uri,
                'overwrite': output_format.overwrite
            })
            logger.debug(
                "Write file to HDFS path: %s and dump it after job done" % hdfs_uri)
            self._remote_temp_files.append(hdfs_uri)
    return output_format
def __init__(self, path, **options):
    """Create a TextFile target.

    The output format's path is the absolute local form of *path* with
    any trailing slashes removed.
    """
    super(TextFile, self).__init__(path, **options)
    target = path_util.to_abs_local_path(path.rstrip("/"))
    self.output_format = _TextOutputFormat(target, **options)