Example #1
    def add_file(self, file_path, resource_path=None, executable=False):
        """
        Add a single file to be packed with the job at runtime.

        Args:
          file_path (str):  path of file to add
          resource_path (str):  path of file at runtime
          executable (bool): if True, the file can be executed

        Raises:
          ValueError:  If invalid arguments are given
        """
        if not isinstance(file_path, str):
            raise ValueError("Invalid input path: must be str")

        if resource_path is None:
            resource_path = os.path.basename(file_path)

        if not isinstance(resource_path, str):
            raise ValueError("Invalid resource path: must be str")

        # TODO(wangcong09): warn users if they specified different file names
        if resource_path in self.__file_targets:
            logger.warn("add [%s] duplicated" % resource_path)
            return

        self.__files.append((file_path, resource_path, executable))
        self.__file_targets.add(resource_path)
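A hedged usage sketch for the method above; the `resource` receiver and the file names are illustrative assumptions, not taken from the source:

    # Pack a local config file so the job can read it as "app.conf" at runtime.
    resource.add_file("/home/work/conf/app.conf", "app.conf")
    # Pack a helper script and mark it executable on the remote side.
    resource.add_file("./tools/preprocess.sh", "preprocess.sh", executable=True)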
Example #2
    def _set_sys_defaultencoding(self, will_pass_encoding=None):
        """
            Pass sys default encoding to remote side.
        """
        import sys
        if will_pass_encoding is None:
            """
            By default,
            if user has reloaded module "sys", we should pass defaultencoding to the remote side.
            Because "reload(sys)" and "sys.setdefaultencoding(encoding)" are most probably related.

            TODO(zhangyuncong):
                We should have a further discussion about
                whether we should pass the encoding by default or not.
            """
            will_pass_encoding = hasattr(sys, 'setdefaultencoding')

        if will_pass_encoding:
            default_encoding = sys.getdefaultencoding()
            logger.warn("pass defaultencoding %s to the remote side" % default_encoding)
            def set_default_encoding():
                import sys
                reload(sys)
                sys.setdefaultencoding(default_encoding)

            # \0 makes sure this hook will run before user's
            self.set_init_hook('\0set_sys_defaultencoding', set_default_encoding)
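A minimal sketch of the ordering trick noted in the final comment, assuming init hooks are executed in sorted-name order (an assumption; the hook registry itself is not shown in the source):

    calls = []
    hooks = {
        '\0set_sys_defaultencoding': lambda: calls.append('encoding'),
        'user_hook': lambda: calls.append('user'),
    }
    for name in sorted(hooks):   # '\0' sorts before every printable character
        hooks[name]()
    assert calls == ['encoding', 'user']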
Example #3
        def wrapper(test_class_obj):
            """ inner """
            _first_run = True
            if test_class_obj.pipeline_type in modes:
                for filesystem in expect_filesystems:
                    if filesystem in test_class_obj.support_file_system:
                        if _skip_filesystem_test(filesystem):
                            continue
                        test_class_obj.root_path = test_class_obj.root_path_dict[
                            filesystem]
                        test_class_obj.running_on_filesystem = filesystem
                        logger.info(
                            "running case [%s.%s] root_path=[%s], filesystem=[%s]"
                            % (type(test_class_obj).__name__, fn.func_name,
                               test_class_obj.root_path,
                               test_class_obj.running_on_filesystem))

                        if not _first_run:
                            test_class_obj.tearDown()
                        test_class_obj.setUp()
                        fn(test_class_obj)
                        _first_run = False
                    else:
                        logger.warn('\033[01;31mWarning!!! %s not executed, '
                                    'because filesystem %s is not supported.\033[00m'
                                    % (fn.__name__, filesystem))
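The wrapper above closes over `fn`, `modes` and `expect_filesystems`, so it is presumably the inner function of a decorator factory. A hedged sketch of that outer structure; the factory name `run_with_filesystems` is an assumption:

    def run_with_filesystems(modes, expect_filesystems):
        # Decorate a test method so it runs once per supported filesystem.
        def decorator(fn):
            def wrapper(test_class_obj):
                pass  # body as shown in the example above
            return wrapper
        return decorator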
Example #4
    def add_dynamic_library(self, file_path):
        """
        Add a dynamic library file(.so) to be packed with the job and set it to LD_LIBRARY_PATH
        at runtime.

        Args:
          file_path:  Path of the library file
        """
        path = os.path.abspath(file_path)
        file_name = os.path.basename(path)
        if file_name in self.__library_targets:
            logger.warn("add [%s] duplicated" % file_path)
            return

        self.__libraries.append((file_name, path))
        self.__library_targets.add(file_name)
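A hedged usage sketch; the `resource` receiver and the library path are illustrative assumptions:

    # Ship a native extension with the job; duplicates are detected by basename.
    resource.add_dynamic_library("./build/lib/libmyudf.so")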
Example #5
    def add_file(self, file_path, resource_path=None, executable=False):
        """
        向Pipeline添加单个文件,使得该文件能够在运行期被访问

        Args:
          file_path(str): 需要添加的文件路径,支持本地, HDFS 路径
          resource_path (str): 远端运行时访问该文件的本地路径, 应是相对路径. 也即在远端, file_path 将会被映射
                               成该 resource_path 路径, 用户程序可以直接用该路径访问到 file_path 对应的文件
          executable (bool): 若为True,则该文件在运行期会被添加可执行属性
        """
        if path_util.is_hdfs_path(file_path.lower()):
            if executable:
                logger.warn("Setting executable for a cache file is not supported yet, "
                            "ignoring the executable property")
            self.__append_cache_file(file_path, resource_path, executable)
        else:
            self._resource.add_file(file_path, resource_path, executable)
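A hedged usage sketch of the local/HDFS dispatch above; the `pipeline` object and the paths are illustrative assumptions:

    # A local file goes through the resource packer and may be marked executable.
    pipeline.add_file("./scripts/clean.py", "clean.py", executable=True)
    # An HDFS path is handled as a cache file; the executable flag is ignored with a warning.
    pipeline.add_file("hdfs:///app/share/dict.txt", "dict.txt")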
Example #6
    def add_file_from_bytes(self, source_bytes, resource_path=None):
        """
        Add a single file to be packed with the job at runtime.

        Args:
          source_bytes (str):  the source binaries
          resource_path (str):  path of file at runtime
          executable (bool): if True, the file can be executed

        Raises:
          ValueError:  If invalid arguments are given
        """
        if not isinstance(source_bytes, str) or not isinstance(resource_path, str):
            raise ValueError("Invalid source bytes: must be str")

        if resource_path in self.__file_targets:
            logger.warn("add [%s] duplicated" % resource_path)
            return

        self.__binary_files.append((resource_path, source_bytes))
        self.__file_targets.add(resource_path)
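A hedged usage sketch; the `resource` receiver and the generated content are illustrative assumptions:

    # Generate a small config at submit time and ship it under a fixed runtime name.
    content = "threads=4\nlog_level=INFO\n"
    resource.add_file_from_bytes(content, "runtime.conf")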
Example #7
    def add_egg_file(self, file_path):
        """
        Add an .egg file to be packed with the job and add its path to PYTHONPATH at runtime.

        Args:
          file_path (str):  path of .egg file

        Raises:
          ValueError:  If invalid arguments are given
        """
        if not isinstance(file_path, str):
            raise ValueError("Invalid input path: must be str")

        path = os.path.abspath(file_path)
        file_name = os.path.basename(path)
        if file_name in self.__egg_file_targets:
            logger.warn("add [%s] duplicated" % file_name)
            return

        self.__egg_files.append((file_name, path))
        self.__egg_file_targets.add(file_name)
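A hedged usage sketch; the `resource` receiver and the egg path are illustrative assumptions:

    # Ship a pre-built dependency so it can be imported on the remote side via PYTHONPATH.
    resource.add_egg_file("./third_party/my_dep-0.1.0-py2.7.egg")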
Example #8
    def __init__(self, **pipeline_options):
        super(SparkPipeline, self).__init__(**pipeline_options)
        self._type_str = "SPARK"
        self._local_uri_infos = []
        self._default_job_name = self._get_default_job_name()

        if "hadoop_config_path" in pipeline_options:
            pipeline_options["hadoop_config_path"] = os.path.abspath(
                pipeline_options["hadoop_config_path"])
        if "hadoop_client_path" in pipeline_options:
            pipeline_options["hadoop_client_path"] = os.path.abspath(
                pipeline_options["hadoop_client_path"])
        if "spark_home_path" in pipeline_options:
            pipeline_options["spark_home_path"] = os.path.abspath(
                pipeline_options["spark_home_path"])

        class _DelayParam(object):
            """ Config value resolved lazily from a callable. """
            def __init__(self, fn):
                """ Store the callable that produces the value. """
                self.__fn = fn

            def get(self):
                """ Resolve and return the value. """
                return self.__fn()

        # config as pb message
        self._job_config = config_pb2.PbSparkConfig()

        def _get_reprepare_cache_archive():
            reprepare = os.getenv('BIGFLOW_REPREPARE_CACHE_ARCHIVE')
            if not reprepare:
                reprepare = False
            elif 'true' == reprepare.lower():
                reprepare = True
            else:
                reprepare = False
            return reprepare

        from bigflow import serde
        # config as dict
        self._default_spark_conf = {
            "spark.app.name": self._default_job_name,
            "spark.master": "yarn",
            "spark.local.dir": ".bigflow.on.spark",
            "spark.executor.extraClassPath": "spark_launcher.jar",
            "spark.executorEnv.PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION":
            "python",
            "spark.hadoop.fs.file.impl":
            "org.apache.hadoop.fs.LocalFileSystem",
            #"spark.hadoop.fs.hdfs.impl": "org.apache.hadoop.fs.DFileSystem",
        }

        default_merging_spark_conf = {
            # (key, value, separator, prepend)
            ("spark.executor.extraLibraryPath",
             ".:__bigflow_on_spark__/flume:__bigflow_on_spark__/python_runtime/lib",
             ":", True)
        }

        self._config = {
            'hadoop_config_path':
            _DelayParam(requests.default_hadoop_config_path),
            'hadoop_client_path':
            _DelayParam(requests.default_hadoop_client_path),
            'spark_home_path': _DelayParam(requests.default_spark_home_path),
            'default_serde': serde.DefaultSerde(),
            'spark_conf': {},
            'reprepare_cache_archive':
            _DelayParam(_get_reprepare_cache_archive),
            'bigflow_version': bigflow_version.bigflow_version,
            # now only support cpu profiling
            'cpu_profile': False,
            'heap_profile': False,
        }
        # update config by pipeline options
        self._config.update(pipeline_options)
        self._default_spark_conf.update(pipeline_options.get("spark_conf", {}))
        # merge spark configs which should not be simply replaced.
        for (k, v, sep, prepend) in default_merging_spark_conf:
            original_v = self._default_spark_conf.get(k)
            if original_v is None:
                self._default_spark_conf[k] = v
            else:
                merged_v = v + sep + original_v if prepend else original_v + sep + v
                self._default_spark_conf[k] = merged_v

        # Accept job_name as spark application name
        if self._config.get("job_name"):
            self._default_spark_conf["spark.app.name"] = self._config[
                "job_name"]
        self._config["spark_conf"] = self._default_spark_conf
        for key in self._config.keys():
            if isinstance(self._config[key], _DelayParam):
                self._config[key] = self._config[key].get()

        # check spark_home is set and valid
        spark_home_path = self._config["spark_home_path"]
        assert spark_home_path, "Spark home is not set, please specify spark home by " \
                                "Pipeline.create or by setting SPARK_HOME environment variable"
        assert os.path.isdir(spark_home_path), \
            "Specified spark_home: %s is not a valid path" % spark_home_path
        # Use spark's core-site.xml instead of the default hadoop client's config path unless
        # a hadoop config path is explicitly specified. This is still debatable.
        if not ("hadoop_config_path" in pipeline_options
                or "HADOOP_CONF_PATH" in os.environ):
            self._config["hadoop_config_path"] = os.path.join(
                spark_home_path, "conf/core-site.xml")

        for (k, v) in self._config['spark_conf'].items():
            kv = self._job_config.kv_config.add()
            kv.key = k
            kv.value = v

        # set cpu and heap profiling switch.
        self._job_config.cpu_profile = self._config.get("cpu_profile", False)
        self._job_config.heap_profile = self._config.get("heap_profile", False)

        self._job_config.hadoop_config_path = self._config[
            'hadoop_config_path']
        self._job_config.hadoop_client_path = self._config[
            'hadoop_client_path']
        self._job_config.spark_home_path = self._config['spark_home_path']

        if 'tmp_data_path' not in self._config:
            err_msg = "Please set tmp_data_path to a writable HDFS dir" \
                + " when you use the spark pipeline to run Bigflow."
            logger.warn(err_msg)
            raise error.InvalidConfException(err_msg)
        if not self._config['tmp_data_path'].startswith('hdfs://'):
            self._config['tmp_data_path'] = \
                "hdfs://" + self._config['tmp_data_path']
            err_msg = "!!!!! Your tmp_data_path does not start with hdfs://, " \
                    + "so Bigflow prepended `hdfs://` by default. !!!!!"
            logger.warn(err_msg)
        self._config['tmp_data_path'] = os.path.join(
            self._config['tmp_data_path'], self._config['bigflow_version'])
        self._job_config.tmp_data_path = self._config['tmp_data_path']
        self.prepared_archive_path = self._config['tmp_data_path'] \
            + "/" + SparkPipeline.cache_archive_file_name
        self._job_config.prepared_archive_path = self.prepared_archive_path

        if 'default_concurrency' in self._config:
            self._job_config.default_concurrency = self._config[
                'default_concurrency']

        pb = pipeline_pb2.PbPipeline()
        pb.type = pipeline_pb2.PbPipeline.SPARK
        pb.spark_config.CopyFrom(self._job_config)

        requests.register_pipeline(pb, self.id())
        logger.debug("Register Pipeline %s OK" % self.id())

        self._pipeline_tmp_dir = os.path.join(self._job_config.tmp_data_path,
                                              'pipeline', self.id())
        self._local_exception_path = os.path.join('.tmp', self.id(),
                                                  'exception')
        self._exception_path = os.path.join(self._pipeline_tmp_dir,
                                            'exception_dir', 'exception')
        self._set_python_path_in_init_hooks()

        self._is_first_run = True
        self._client = hadoop_client.HadoopClient(
            self._job_config.hadoop_client_path,
            self._job_config.hadoop_config_path)
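A hedged construction sketch showing the options this `__init__` actually reads; the concrete values, and constructing `SparkPipeline` directly rather than through a factory, are assumptions:

    pipeline = SparkPipeline(
        tmp_data_path="hdfs:///app/my_team/bigflow_tmp",  # required, must be a writable HDFS dir
        spark_home_path="/opt/spark",                     # must exist, or the assert fails
        job_name="wordcount_on_spark",                    # becomes spark.app.name
        spark_conf={"spark.executor.memory": "4g"},       # merged over the built-in defaults
    )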