Example #1
    def _transform_output_format(self, pcollection, output_format):
        from bigflow.util import path_util
        from bigflow.util import utils

        format_type = output_format.get_entity_name()
        # todo: extract ugi from output_format, support multiple clusters and ugis
        if format_type == "TextOutputFormat" or \
                format_type == "SequenceFileAsBinaryOutputFormat":
            uri = path_util.to_abs_local_path(output_format.path)

            if utils.is_infinite(pcollection):
                if not path_util.is_hdfs_path(uri):
                    raise ValueError("That write infinite PType to local file "
                            "is not supported in MRPipeline")
                else:
                    output_format.path = self._toft_path(uri)
            else:
                if not path_util.is_hdfs_path(uri):
                    # The user asked MRPipeline to write to a local file; replace the
                    # original uri with a temp path on HDFS and dump the output to the
                    # local FS after the job is done.
                    hdfs_uri = self._tmp_hdfs_path(uri)
                    output_format.path = self._toft_path(hdfs_uri)
                    self._local_uri_infos.append({
                        'local_uri': uri,
                        'hdfs_uri': hdfs_uri,
                        'overwrite': output_format.overwrite
                    })
                    logger.debug(
                            "Write file to HDFS path: %s and dump it after the job is done" % hdfs_uri)
                    self._remote_temp_files.append(hdfs_uri)
                else:
                    output_format.path = self._toft_path(self._tmp_output_path(uri))
                    output_format.commit_path = self._toft_path(uri)

        return output_format
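
Every example on this page branches on path_util.is_hdfs_path, whose implementation is not shown. A minimal sketch of what it presumably checks (a URI scheme prefix); this is an assumption, the real helper in bigflow.util.path_util may accept additional schemes or validate more strictly:

    # Sketch only: the real bigflow.util.path_util.is_hdfs_path may differ.
    def is_hdfs_path(uri):
        # Treat any URI starting with the hdfs:// scheme as a remote HDFS path.
        return uri.lower().startswith("hdfs://")

    assert is_hdfs_path("hdfs://namenode:9000/tmp/out")
    assert not is_hdfs_path("/home/user/out.txt")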
Example #2
    def __append_cache_archive(self, file_path, resource_path):
        # Register file_path as a Hadoop cache archive, unpacked under resource_path.
        resource_path = resource_path.rstrip("/")

        if not path_util.is_hdfs_path(file_path):
            file_path = os.path.abspath(file_path)

        cache_archives = self._resource.cache_archive_list
        if cache_archives is None:
            cache_archives = "%s#%s" % (file_path, resource_path)
        else:
            cache_archives = "%s,%s#%s" % (cache_archives, file_path, resource_path)
        self._resource.cache_archive_list = cache_archives
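
The string being built is Hadoop's comma-separated list of archive#linkname entries. A hypothetical stand-alone version of the same concatenation, only to show the resulting format (append_cache_archive is not part of bigflow):

    # Hypothetical rewrite of the concatenation above, illustrating the
    # "file#resource" list format Hadoop expects for cache archives.
    def append_cache_archive(cache_archives, file_path, resource_path):
        entry = "%s#%s" % (file_path, resource_path.rstrip("/"))
        if cache_archives is None:
            return entry
        return "%s,%s" % (cache_archives, entry)

    archives = None
    archives = append_cache_archive(archives, "hdfs://nn/libs/dep.tar.gz", "dep/")
    archives = append_cache_archive(archives, "/tmp/extra.zip", "extra")
    print(archives)  # hdfs://nn/libs/dep.tar.gz#dep,/tmp/extra.zip#extra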
Example #3
    def _transform_uri(self, uri, format_type, ugi=None):
        from bigflow.util import path_util

        if format_type == "TextInputFormat" or \
                format_type == "SequenceFileAsBinaryInputFormat":
            uri = path_util.to_abs_local_path(uri)

            if not path_util.is_hdfs_path(uri):
                return self._upload_file(uri)
            else:
                return uri
        # todo: support multiple clusters and ugis for spark_pipeline
        return uri
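
A condensed view of the dispatch above, with a lambda standing in for self._upload_file (which ships a local input file to the cluster so workers can read it); the function name and paths here are illustrative only:

    # Illustration only: upload_file stands in for self._upload_file.
    def transform_uri(uri, format_type, upload_file):
        if format_type in ("TextInputFormat", "SequenceFileAsBinaryInputFormat"):
            if not uri.lower().startswith("hdfs://"):
                return upload_file(uri)   # local input must be uploaded first
        return uri                        # HDFS inputs and other formats pass through

    print(transform_uri("data/input.txt", "TextInputFormat",
                        lambda u: "hdfs://tmp/upload/" + u))
    # hdfs://tmp/upload/data/input.txt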
Example #4
    def add_file(self, file_path, resource_path=None, executable=False):
        """
        向Pipeline添加单个文件,使得该文件能够在运行期被访问

        Args:
          file_path(str): 需要添加的文件路径,可以是本地路径或者 HDFS 路径.
          resource_path(str): local 引擎运行时访问该文件的路径, 应是相对路径. 也即本地引擎在执行时,
                              file_path 将会被映射到该 resource_path, 用户程序可以以该路径访问
          executable (bool): 若为True,则该文件在运行期会被添加可执行属性
        """
        if path_util.is_hdfs_path(file_path.lower()):
            self._add_remote_file(file_path, resource_path, executable)
        else:
            self._resource.add_file(file_path, resource_path, executable)
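
Typical calls, assuming pipeline is an instance of a Pipeline class exposing this method (the paths are illustrative):

    # Local file: registered on the resource object and shipped with the job.
    pipeline.add_file("conf/settings.ini", resource_path="settings.ini")

    # HDFS file: routed to _add_remote_file. The check uses file_path.lower(),
    # so an upper-case HDFS:// scheme also matches.
    pipeline.add_file("HDFS://namenode:9000/share/tool", resource_path="tool",
                      executable=True)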
Example #5
    def add_file(self, file_path, resource_path=None, executable=False):
        """
        向Pipeline添加单个文件,使得该文件能够在运行期被访问

        Args:
          file_path(str): 需要添加的文件路径,支持本地, HDFS 路径
          resource_path (str): 远端运行时访问该文件的本地路径, 应是相对路径. 也即在远端, file_path 将会被映射
                               成该 resource_path 路径, 用户程序可以直接用该路径访问到 file_path 对应的文件
          executable (bool): 若为True,则该文件在运行期会被添加可执行属性
        """
        if path_util.is_hdfs_path(file_path.lower()):
            if executable:
                logger.warn("Set executable for cache file is not supported yet, "
                            "ignore executable property")
            self.__append_cache_file(file_path, resource_path, executable)
        else:
            self._resource.add_file(file_path, resource_path, executable)
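
The behavioral difference from the previous variant: HDFS files become cache files, and the executable flag is ignored with a warning rather than honored. An illustrative call (the path is hypothetical):

    # executable=True on an HDFS path only logs a warning here; the file is
    # still registered as a regular (non-executable) cache file.
    pipeline.add_file("hdfs://namenode:9000/share/run.sh", "run.sh", executable=True)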
Example #6
    def _path_exists(self, path):
        if path_util.is_hdfs_path(path):
            return self._hadoop_client().fs_test(path, self._hadoop_config)
        return os.path.exists(path)
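
fs_test on the Hadoop client is not shown; an equivalent stand-alone check, under the assumption that it has the standard hadoop fs -test -e semantics (exit code 0 iff the path exists):

    import os
    import subprocess

    # Sketch assuming fs_test wraps `hadoop fs -test -e`; requires the
    # hadoop CLI on PATH.
    def path_exists(path):
        if path.lower().startswith("hdfs://"):
            return subprocess.call(["hadoop", "fs", "-test", "-e", path]) == 0
        return os.path.exists(path)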