示例#1
0
    def _transform_output_format(self, pcollection, output_format):
        """Rewrite the paths of a file-based output format before the job runs.

        Only ``TextOutputFormat`` and ``SequenceFileAsBinaryOutputFormat`` are
        handled; every other format is returned untouched.

        Args:
            pcollection: the dataset being written; ``utils.is_infinite`` is
                used to pick the branch.
            output_format: the output format object. Its ``path`` (and, for a
                finite dataset written to HDFS, its ``commit_path``) is
                modified in place.

        Returns:
            The same ``output_format`` object that was passed in.

        Raises:
            ValueError: an infinite dataset is being written to a non-HDFS path.
        """
        from bigflow.util import path_util
        from bigflow.util import utils

        # todo: extract ugi from output_format, support multiple clusters and ugis
        file_formats = ("TextOutputFormat", "SequenceFileAsBinaryOutputFormat")
        if output_format.get_entity_name() not in file_formats:
            return output_format

        uri = path_util.to_abs_local_path(output_format.path)
        infinite = utils.is_infinite(pcollection)
        on_hdfs = path_util.is_hdfs_path(uri)

        if infinite and not on_hdfs:
            raise ValueError("That write infinite PType to local file "
                             "is not supported in MRPipeline")

        if infinite:
            # Infinite data goes straight to the requested HDFS location.
            output_format.path = self._toft_path(uri)
        elif on_hdfs:
            # Finite data on HDFS: write to a temporary output path and record
            # the requested uri as the commit path.
            output_format.path = self._toft_path(self._tmp_output_path(uri))
            output_format.commit_path = self._toft_path(uri)
        else:
            # The user asked for a local file. Redirect the write to a
            # temporary HDFS path and remember the mapping so the output can be
            # dumped to the local FS after the job is done.
            hdfs_uri = self._tmp_hdfs_path(uri)
            output_format.path = self._toft_path(hdfs_uri)
            dump_info = {
                'local_uri': uri,
                'hdfs_uri': hdfs_uri,
                'overwrite': output_format.overwrite,
            }
            self._local_uri_infos.append(dump_info)
            logger.debug(
                "Write file to HDFS path: %s and dump it after job done" % hdfs_uri)
            self._remote_temp_files.append(hdfs_uri)

        return output_format
示例#2
0
    def _transform_uri(self, uri, format_type, ugi=None):
        """Convert an input uri into the form expected for ``format_type``.

        Args:
            uri: the input path given by the user.
            format_type: name of the input format the uri belongs to.
            ugi: passed to ``self._toft_path`` for the ``*WithUgi`` formats;
                ignored otherwise.

        Returns:
            * For the plain file/stream formats: the toft-style path, or, when
              that path is not a toft-style DFS path, the list produced by
              ``glob.glob`` on it.
            * For the ``*WithUgi`` formats: the toft-style path built with ``ugi``.
            * For any other format: ``uri`` unchanged.
        """
        import glob
        from bigflow.util import path_util

        plain_formats = (
            "TextInputFormat",
            "SequenceFileAsBinaryInputFormat",
            "OrcInputFormat",
            "ParquetInputFormat",
            "TextStreamInputFormat",
            "SequenceStreamInputFormat",
        )
        ugi_formats = ("TextInputFormatWithUgi", "SequenceStreamInputFormatWithUgi")

        if format_type in plain_formats:
            toft_uri = self._toft_path(path_util.to_abs_local_path(uri))
            if path_util.is_toft_style_dfs_path(toft_uri):
                return toft_uri
            # support local file glob
            return glob.glob(toft_uri)

        if format_type in ugi_formats:
            return self._toft_path(path_util.to_abs_local_path(uri), ugi)

        return uri
示例#3
0
    def _transform_uri(self, uri, format_type, ugi=None):
        """Make an input uri usable by this pipeline.

        Args:
            uri: the input path given by the user.
            format_type: name of the input format the uri belongs to.
            ugi: accepted for interface compatibility; not used here.

        Returns:
            For ``TextInputFormat`` / ``SequenceFileAsBinaryInputFormat``: the
            absolute uri if it is an HDFS path, otherwise the result of
            ``self._upload_file`` on it. For any other format: ``uri`` unchanged.
        """
        from bigflow.util import path_util

        if format_type not in ("TextInputFormat", "SequenceFileAsBinaryInputFormat"):
            # todo: support multiple clusters and ugis for spark_pipeline
            return uri

        abs_uri = path_util.to_abs_local_path(uri)
        if path_util.is_hdfs_path(abs_uri):
            return abs_uri
        # Non-HDFS input is handed to _upload_file first.
        return self._upload_file(abs_uri)
示例#4
0
    def _transform_uri(self, uri, format_type, ugi=None):
        """Resolve an input uri for the given input format.

        Args:
            uri: the input path given by the user.
            format_type: name of the input format the uri belongs to.
            ugi: accepted for interface compatibility; not used here.

        Returns:
            ``uri`` unchanged for formats other than ``TextInputFormat`` and
            ``SequenceFileAsBinaryInputFormat``. For those two, the absolute
            uri when it is an HDFS path, else whatever ``self._upload_file``
            returns for it.
        """
        from bigflow.util import path_util

        is_file_format = format_type in (
            "TextInputFormat",
            "SequenceFileAsBinaryInputFormat",
        )
        if is_file_format:
            resolved = path_util.to_abs_local_path(uri)
            needs_upload = not path_util.is_hdfs_path(resolved)
            return self._upload_file(resolved) if needs_upload else resolved

        # todo: support multiple clusters and ugis for spark_pipeline
        return uri
示例#5
0
    def _transform_uri(self, uri, format_type, ugi=None):
        """Translate an input uri according to its input format.

        Args:
            uri: the input path given by the user.
            format_type: name of the input format the uri belongs to.
            ugi: forwarded to ``self._toft_path`` only for the ``*WithUgi``
                formats.

        Returns:
            A toft-style path, a list of glob matches (when the toft-style
            path is not a toft-style DFS path), or the original ``uri`` when
            ``format_type`` is not one of the formats handled here.
        """
        import glob
        from bigflow.util import path_util

        without_ugi = (
            "TextInputFormat",
            "SequenceFileAsBinaryInputFormat",
            "OrcInputFormat",
            "ParquetInputFormat",
            "TextStreamInputFormat",
            "SequenceStreamInputFormat",
        )
        with_ugi = ("TextInputFormatWithUgi", "SequenceStreamInputFormatWithUgi")

        if format_type in without_ugi:
            absolute = path_util.to_abs_local_path(uri)
            converted = self._toft_path(absolute)
            if not path_util.is_toft_style_dfs_path(converted):
                # support local file glob
                converted = glob.glob(converted)
            return converted

        if format_type in with_ugi:
            absolute = path_util.to_abs_local_path(uri)
            return self._toft_path(absolute, ugi)

        return uri
示例#6
0
    def _transform_output_format(self, pcollection, output_format):
        """Rewrite the paths of a file-based output format to toft-style paths.

        Handles ``TextOutputFormat``, ``SequenceFileAsBinaryOutputFormat``,
        ``ParquetOutputFormat`` and ``PartitionedParquetOutputFormat``; other
        formats are returned untouched.

        Args:
            pcollection: the dataset being written; ``utils.is_infinite`` is
                used to pick the branch.
            output_format: the output format object. Its ``path`` (and, for a
                finite dataset, its ``commit_path``) is modified in place. If
                it has a ``ugi`` attribute, that value is passed to
                ``self._toft_path``.

        Returns:
            The same ``output_format`` object that was passed in.
        """
        from bigflow.util import path_util
        from bigflow.util import utils

        format_type = output_format.get_entity_name()

        ugi = None
        if hasattr(output_format, "ugi"):
            ugi = output_format.ugi

        file_formats = (
            "TextOutputFormat",
            "SequenceFileAsBinaryOutputFormat",
            "ParquetOutputFormat",
            "PartitionedParquetOutputFormat",
        )
        if format_type not in file_formats:
            return output_format

        uri = path_util.to_abs_local_path(output_format.path)
        if utils.is_infinite(pcollection):
            output_format.path = self._toft_path(uri, ugi)
            return output_format

        # Finite data: write to a temporary output path and record the
        # requested uri as the commit path.
        tmp_uri = self._tmp_output_path(uri)
        output_format.path = self._toft_path(tmp_uri, ugi)
        output_format.commit_path = self._toft_path(uri, ugi)
        return output_format
示例#7
0
    def _transform_output_format(self, pcollection, output_format):
        """Point a file-based output format at toft-style paths.

        Args:
            pcollection: the dataset being written; checked with
                ``utils.is_infinite``.
            output_format: the output format object, modified in place. An
                optional ``ugi`` attribute on it is forwarded to
                ``self._toft_path``.

        Returns:
            ``output_format`` itself. Formats other than the text, sequence
            file and (partitioned) parquet output formats are left unchanged.
        """
        from bigflow.util import path_util
        from bigflow.util import utils

        entity_name = output_format.get_entity_name()

        ugi = None
        if hasattr(output_format, "ugi"):
            ugi = output_format.ugi

        handled = entity_name in (
            "TextOutputFormat",
            "SequenceFileAsBinaryOutputFormat",
            "ParquetOutputFormat",
            "PartitionedParquetOutputFormat",
        )
        if handled:
            target = path_util.to_abs_local_path(output_format.path)
            if not utils.is_infinite(pcollection):
                # Finite data is written to a temporary output path; the
                # requested target becomes the commit path.
                staging = self._tmp_output_path(target)
                output_format.path = self._toft_path(staging, ugi)
                output_format.commit_path = self._toft_path(target, ugi)
            else:
                output_format.path = self._toft_path(target, ugi)

        return output_format
示例#8
0
    def __init__(self, path, **options):
        """Set up a sequence-file output target.

        Args:
            path: output path; trailing ``/`` characters are stripped before it
                is converted with ``path_util.to_abs_local_path``.
            **options: forwarded to the parent initializer and to the
                underlying output format. ``key_serde`` and ``value_serde``
                are additionally read here.

        Raises:
            error.InvalidSeqSerdeException: only one of ``key_serde`` /
                ``value_serde`` was supplied.
        """
        super(SequenceFile, self).__init__(path, **options)

        abs_path = path_util.to_abs_local_path(path.rstrip("/"))
        self.output_format = _SequenceFileAsBinaryOutputFormat(abs_path, **options)

        self.kv_serializer = None
        self.options = options

        # The serdes only take effect when the user sets both key_serde and
        # value_serde, or neither of them; otherwise an error is raised.
        key_serde = options.get("key_serde")
        value_serde = options.get("value_serde")

        if bool(key_serde) != bool(value_serde):
            raise error.InvalidSeqSerdeException("key and value serde should be both set or not.")

        both_given = key_serde is not None and value_serde is not None
        self.kv_serializer = (
            entity.KVSerializeFn(key_serde, value_serde) if both_given else None
        )
示例#9
0
    def __init__(self, path, **options):
        """Create a sequence-file output target for ``path``.

        Args:
            path: output path; trailing ``/`` characters are removed and the
                result is passed through ``path_util.to_abs_local_path``.
            **options: passed on to the parent initializer and the underlying
                output format. ``key_serde`` / ``value_serde`` select the
                key-value serializer.

        Raises:
            error.InvalidSeqSerdeException: exactly one of ``key_serde`` and
                ``value_serde`` is set.
        """
        super(SequenceFile, self).__init__(path, **options)

        trimmed_path = path.rstrip("/")
        self.output_format = _SequenceFileAsBinaryOutputFormat(
            path_util.to_abs_local_path(trimmed_path), **options)

        self.kv_serializer = None
        self.options = options

        # key_serde and value_serde are honoured only when the user sets both
        # of them or neither of them; any other combination is an error.
        serde_k = options.get("key_serde")
        serde_v = options.get("value_serde")

        only_one_set = bool(serde_k) != bool(serde_v)
        if only_one_set:
            raise error.InvalidSeqSerdeException(
                "key and value serde should be both set or not.")

        if serde_k is None or serde_v is None:
            self.kv_serializer = None
        else:
            self.kv_serializer = entity.KVSerializeFn(serde_k, serde_v)
示例#10
0
    def _transform_output_format(self, pcollection, output_format):
        """Adjust a file-based output format's paths for this pipeline.

        Applies to ``TextOutputFormat`` and
        ``SequenceFileAsBinaryOutputFormat`` only; other formats pass through.

        Args:
            pcollection: the dataset being written; checked with
                ``utils.is_infinite``.
            output_format: the output format object, modified in place
                (``path`` always, ``commit_path`` for finite data on HDFS).

        Returns:
            ``output_format`` itself.

        Raises:
            ValueError: an infinite dataset targets a path that is not on HDFS.
        """
        from bigflow.util import path_util
        from bigflow.util import utils

        # todo: extract ugi from output_format, support multiple clusters and ugis
        entity_name = output_format.get_entity_name()
        if entity_name in ("TextOutputFormat", "SequenceFileAsBinaryOutputFormat"):
            target = path_util.to_abs_local_path(output_format.path)
            is_stream = utils.is_infinite(pcollection)
            target_on_hdfs = path_util.is_hdfs_path(target)

            if target_on_hdfs:
                if is_stream:
                    output_format.path = self._toft_path(target)
                else:
                    # Write to a temporary output path; the requested target
                    # is recorded as the commit path.
                    output_format.path = self._toft_path(
                        self._tmp_output_path(target))
                    output_format.commit_path = self._toft_path(target)
            elif is_stream:
                raise ValueError("That write infinite PType to local file "
                                 "is not supported in MRPipeline")
            else:
                # Local target: replace it with a temporary HDFS path and keep
                # the mapping so the output can be dumped to the local FS once
                # the job is done.
                staging = self._tmp_hdfs_path(target)
                output_format.path = self._toft_path(staging)
                self._local_uri_infos.append({
                    'local_uri': target,
                    'hdfs_uri': staging,
                    'overwrite': output_format.overwrite,
                })
                logger.debug(
                    "Write file to HDFS path: %s and dump it after job done"
                    % staging)
                self._remote_temp_files.append(staging)

        return output_format
示例#11
0
    def __init__(self, path, **options):
        """Set up a text-file output target.

        Args:
            path: output path; trailing ``/`` characters are stripped before it
                is converted with ``path_util.to_abs_local_path``.
            **options: forwarded to the parent initializer and to the
                underlying ``_TextOutputFormat``.
        """
        super(TextFile, self).__init__(path, **options)

        abs_path = path_util.to_abs_local_path(path.rstrip("/"))
        self.output_format = _TextOutputFormat(abs_path, **options)
示例#12
0
    def __init__(self, path, **options):
        """Create a text-file output target for ``path``.

        Args:
            path: output path; trailing ``/`` characters are removed and the
                result is passed through ``path_util.to_abs_local_path``.
            **options: passed on to the parent initializer and to
                ``_TextOutputFormat``.
        """
        super(TextFile, self).__init__(path, **options)

        trimmed_path = path.rstrip("/")
        normalized_path = path_util.to_abs_local_path(trimmed_path)
        self.output_format = _TextOutputFormat(normalized_path, **options)