def _transform_output_format(self, pcollection, output_format):
    from bigflow.util import path_util
    from bigflow.util import utils
    format_type = output_format.get_entity_name()
    # todo: extract ugi from output_format, support multiple clusters and ugis
    if format_type == "TextOutputFormat" or \
            format_type == "SequenceFileAsBinaryOutputFormat":
        uri = path_util.to_abs_local_path(output_format.path)
        if utils.is_infinite(pcollection):
            if not path_util.is_hdfs_path(uri):
                raise ValueError("Writing an infinite PType to a local file "
                                 "is not supported in MRPipeline")
            else:
                output_format.path = self._toft_path(uri)
        else:
            if not path_util.is_hdfs_path(uri):
                # The user is writing to a local file through MRPipeline; replace the
                # original uri with a temp path on HDFS and dump the output to the
                # local FS after the job is done.
                hdfs_uri = self._tmp_hdfs_path(uri)
                output_format.path = self._toft_path(hdfs_uri)
                self._local_uri_infos.append({
                    'local_uri': uri,
                    'hdfs_uri': hdfs_uri,
                    'overwrite': output_format.overwrite,
                })
                logger.debug("Write file to HDFS path: %s and dump it after the job is done"
                             % hdfs_uri)
                self._remote_temp_files.append(hdfs_uri)
            else:
                output_format.path = self._toft_path(self._tmp_output_path(uri))
                output_format.commit_path = self._toft_path(uri)
    return output_format
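# Illustration of the redirection above (paths hypothetical): writing a finite
# PCollection to a local target such as "./output/result" through MRPipeline
# never writes locally during the job; the job writes to the temporary HDFS
# path produced by self._tmp_hdfs_path("./output/result"), and the entry
# recorded in self._local_uri_infos is later used to copy that HDFS output
# back to "./output/result" once the job finishes.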
def __append_cache_archive(self, file_path, resource_path):
    resource_path = resource_path.rstrip("/")
    if not path_util.is_hdfs_path(file_path):
        file_path = os.path.abspath(file_path)
    cache_archives = self._resource.cache_archive_list
    if cache_archives is None:
        cache_archives = "%s#%s" % (file_path, resource_path)
    else:
        cache_archives = "%s,%s#%s" % (cache_archives, file_path, resource_path)
    self._resource.cache_archive_list = cache_archives
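# For reference, the string assembled by __append_cache_archive follows
# Hadoop's DistributedCache "source#symlink" convention, with entries joined
# by commas (paths below are hypothetical):
#
#   /home/work/app/data.tar#data,hdfs://ns1/share/dict.tar#dict
#
# Each archive is unpacked on the task nodes and exposed under its symlink name.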
def _transform_uri(self, uri, format_type, ugi=None):
    from bigflow.util import path_util
    if format_type == "TextInputFormat" or \
            format_type == "SequenceFileAsBinaryInputFormat":
        uri = path_util.to_abs_local_path(uri)
        if not path_util.is_hdfs_path(uri):
            return self._upload_file(uri)
        else:
            return uri
    # todo: support multiple clusters and ugis for spark_pipeline
    return uri
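# Behavior sketch (values hypothetical): for a text input, a local uri such as
# "./input/part-00000" is first uploaded through self._upload_file and the
# returned remote uri is used in its place, while an HDFS uri like
# "hdfs://ns1/input/part-00000" passes through unchanged.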
def add_file(self, file_path, resource_path=None, executable=False):
    """
    Add a single file to the Pipeline so that it can be accessed at runtime.

    Args:
        file_path (str): Path of the file to add; may be a local or an HDFS path.
        resource_path (str): Path used to access the file when the local engine
                             runs; should be a relative path. That is, at execution
                             time the local engine maps file_path to resource_path,
                             and user code can access the file through that path.
        executable (bool): If True, the file is made executable at runtime.
    """
    if path_util.is_hdfs_path(file_path.lower()):
        self._add_remote_file(file_path, resource_path, executable)
    else:
        self._resource.add_file(file_path, resource_path, executable)
def add_file(self, file_path, resource_path=None, executable=False):
    """
    Add a single file to the Pipeline so that it can be accessed at runtime.

    Args:
        file_path (str): Path of the file to add; local and HDFS paths are supported.
        resource_path (str): Local path used to access the file on the remote side;
                             should be a relative path. That is, on the remote side
                             file_path is mapped to resource_path, and user code can
                             access the file behind file_path through that path.
        executable (bool): If True, the file is made executable at runtime.
    """
    if path_util.is_hdfs_path(file_path.lower()):
        if executable:
            logger.warn("Setting executable on a cache file is not supported yet; "
                        "ignoring the executable property")
        self.__append_cache_file(file_path, resource_path, executable)
    else:
        self._resource.add_file(file_path, resource_path, executable)
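# Minimal usage sketch (pipeline instance and paths are hypothetical):
#
#   pipeline.add_file("conf/app.conf", "app.conf")
#   pipeline.add_file("hdfs:///share/dict.dat", "dict.dat")       # shipped as a cache file
#   pipeline.add_file("tools/run.sh", "run.sh", executable=True)  # local file, made executable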
def _path_exists(self, path):
    if path_util.is_hdfs_path(path):
        return self._hadoop_client().fs_test(path, self._hadoop_config)
    return os.path.exists(path)
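# E.g. _path_exists("hdfs://ns1/tmp/out") tests the path through the Hadoop
# client's fs_test, while _path_exists("/tmp/out") falls back to os.path.exists.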