Example #1
import sys

from bigdl.util.engine import get_bigdl_classpath


class Configuration(object):
    __bigdl_jars = [get_bigdl_classpath()]

    @staticmethod
    def add_extra_jars(jars):
        """
        Add extra jars to classpath
        :param jars: a string or a list of strings as jar paths
        """
        import six
        if isinstance(jars, six.string_types):
            jars = [jars]
        Configuration.__bigdl_jars += jars

    @staticmethod
    def add_extra_python_modules(packages):
        """
        Add extra python modules to sys.path
        :param packages: a string or a list of strings as python package paths
        """
        import six
        if isinstance(packages, six.string_types):
            packages = [packages]
        for package in packages:
            sys.path.insert(0, package)

    @staticmethod
    def get_bigdl_jars():
        return Configuration.__bigdl_jars
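
Configuration keeps its jar list in a class-level attribute and exposes only static methods, so it is used without instantiation. A minimal usage sketch, assuming the class above is importable from bigdl.util.common and with placeholder paths:

from bigdl.util.common import Configuration

Configuration.add_extra_jars("/path/to/extra-udf.jar")            # a single jar path
Configuration.add_extra_python_modules(["/path/to/my_deps.zip"])  # prepended to sys.path
print(Configuration.get_bigdl_jars())  # the BigDL jar plus the extra jar added above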
Example #2
from pyspark import SparkConf
from bigdl.util.engine import get_bigdl_classpath


def create_spark_conf():
    # get_bigdl_conf, is_spark_below_2_2 and extend_spark_driver_cp are BigDL
    # helpers defined elsewhere in the package and assumed to be in scope here.
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())
    return sparkConf
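
The returned SparkConf is typically handed straight to a SparkContext. A short usage sketch, assuming create_spark_conf above is in scope:

from pyspark import SparkContext

conf = create_spark_conf().setAppName("bigdl-demo").setMaster("local[4]")
sc = SparkContext.getOrCreate(conf=conf)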
Example #3
def get_zoo_bigdl_classpath_on_driver():
    from bigdl.util.engine import get_bigdl_classpath
    from zoo.util.engine import get_analytics_zoo_classpath
    bigdl_classpath = get_bigdl_classpath()
    assert bigdl_classpath, "Cannot find BigDL classpath, please check your installation"
    zoo_classpath = get_analytics_zoo_classpath()
    assert zoo_classpath, "Cannot find Analytics-Zoo classpath, please check your installation"
    return zoo_classpath, bigdl_classpath
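
The two paths are commonly joined into a single driver classpath string. A sketch, assuming the function above is in scope:

zoo_cp, bigdl_cp = get_zoo_bigdl_classpath_on_driver()
driver_classpath = ":".join([zoo_cp, bigdl_cp])  # e.g. for spark.driver.extraClassPath
print(driver_classpath)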
Example #4
    def try_copy_bigdl_jar(self):
        # Copy the BigDL jar found on the classpath to self.bigdl_jar;
        # shutil is assumed to be imported at module level.
        try:
            from bigdl.util.engine import get_bigdl_classpath
            shutil.copyfile(get_bigdl_classpath(), self.bigdl_jar)

        except Exception:
            print("WARNING: if you are running Cluster Serving using pip, you have misconfig"
                  "with bigdl python package, otherwise, ignore this WARNING.")
Example #5
    def try_copy_bigdl_jar(self):
        # Like Example #4, but fails loudly when no BigDL jar is discovered;
        # shutil is assumed to be imported at module level.
        try:
            from bigdl.util.engine import get_bigdl_classpath
            bigdl_jar_src = get_bigdl_classpath()
            if bigdl_jar_src == "":
                raise Exception("BigDL jar not discovered.")
            shutil.copyfile(bigdl_jar_src, self.bigdl_jar)
            print("BigDL jar copied from ", bigdl_jar_src)

        except Exception:
            print(
                "WARNING: if you are running Cluster Serving using pip, your "
                "bigdl python package is misconfigured; otherwise, ignore this WARNING.")
Example #6
import os

from pyspark import SparkConf
from bigdl.util.engine import get_bigdl_classpath


def create_spark_conf():
    # get_bigdl_conf, is_spark_below_2_2 and extend_spark_driver_cp are BigDL
    # helpers defined elsewhere in the package and assumed to be in scope here.
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # Add the content of PYSPARK_FILES to spark.submit.pyFiles.
    # This is a workaround for current Spark on k8s.
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
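
The PYSPARK_FILES workaround can be exercised by exporting the variable before building the conf. A sketch with a hypothetical archive path, assuming create_spark_conf above is in scope:

import os

os.environ["PYSPARK_FILES"] = "/opt/work/python-deps.zip"  # hypothetical path
conf = create_spark_conf()
print(conf.get("spark.submit.pyFiles"))  # now includes /opt/work/python-deps.zip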
Example #7
    def _get_bigdl_jar_name_on_driver(self):
        from bigdl.util.engine import get_bigdl_classpath
        bigdl_classpath = get_bigdl_classpath()
        assert bigdl_classpath, "Cannot find bigdl classpath"
        return bigdl_classpath.split("/")[-1]
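
The helper simply returns the last path component of the jar location. The same idea on a sample (hypothetical) path:

jar_path = "/opt/bigdl/lib/bigdl-with-dependencies.jar"  # hypothetical location
print(jar_path.split("/")[-1])  # -> bigdl-with-dependencies.jar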
Example #8
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="10g",
                              driver_memory="1g",
                              driver_cores=4,
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None):
        # os, init_spark_conf, init_nncontext and SparkRunner itself are
        # assumed to be provided by the surrounding module.
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip
        from zoo.util.engine import get_analytics_zoo_classpath
        from bigdl.util.engine import get_bigdl_classpath

        if 'PYSPARK_PYTHON' not in os.environ:
            os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname
                # rather than the IP address.
                "SPARK_MASTER_HOST": node_ip
            }
            # The scripts installed from pip don't have execute permission,
            # so grant it to them first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_master_pro.pid, 0)
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            start_worker_pro = subprocess.Popen(
                "{}/sbin/start-worker.sh {}".format(zoo_standalone_home,
                                                    master),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_worker_pro.pid, 0)
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = " --master " + master
        submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                    " --executor-cores {} --executor-memory {}"\
            .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
        if extra_python_lib:
            submit_args = submit_args + " --py-files {}".format(
                extra_python_lib)
        if jars:
            submit_args = submit_args + " --jars {}".format(jars)
        submit_args = submit_args + " pyspark-shell"
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

        zoo_bigdl_jar_path = ":".join(
            [get_analytics_zoo_classpath(),
             get_bigdl_classpath()])
        spark_conf = init_spark_conf(conf) \
            .set("spark.driver.cores", driver_cores) \
            .set("spark.driver.memory", driver_memory) \
            .set("spark.executor.instances", num_executors) \
            .set("spark.executor.cores", executor_cores) \
            .set("spark.cores.max", num_executors * executor_cores) \
            .set("spark.executorEnv.PYTHONHOME",
                 "/".join(self._detect_python_location().split("/")[:-2]))
        if extra_executor_memory_for_ray:
            spark_conf.set("spark.executor.memoryOverhead",
                           extra_executor_memory_for_ray)
        if spark_conf.contains("spark.executor.extraClassPath"):
            spark_conf.set(
                "spark.executor.extraClassPath",
                "{}:{}".format(zoo_bigdl_jar_path,
                               conf.get("spark.executor.extraClassPath")))
        else:
            spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

        sc = init_nncontext(spark_conf,
                            redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        return sc
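
A hedged call sketch; the SparkRunner constructor arguments below are assumptions, and only a few basic parameters of the method are shown:

runner = SparkRunner(spark_log_level="WARN", redirect_spark_log=True)  # assumed constructor
sc = runner.init_spark_standalone(num_executors=2,
                                  executor_cores=4,
                                  executor_memory="10g",
                                  driver_memory="2g")
# sc is a SparkContext connected to the locally started standalone master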