def _create_sc(self, submit_args, conf):
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
    zoo_conf = init_spark_conf(conf)
    sc = init_nncontext(conf=zoo_conf, spark_log_level=self.spark_log_level,
                        redirect_spark_log=self.redirect_spark_log)
    return sc
def _create_sc(self, submit_args, conf):
    from pyspark.sql import SparkSession
    print("pyspark_submit_args is: {}".format(submit_args))
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
    zoo_conf = init_spark_conf(conf)
    sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
    sc.setLogLevel(self.spark_log_level)
    return sc
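# Illustrative usage sketch (not part of the original source): _create_sc expects a
# pyspark-shell style argument string that ends with "pyspark-shell"; the flags, the
# conf value, and the `runner` instance below are assumptions for illustration only.
#
#   submit_args = " --master local[4] --driver-memory 2g pyspark-shell"
#   sc = runner._create_sc(submit_args, conf={"spark.app.name": "demo"})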
def init_spark_on_local(self, cores, conf=None, python_location=None):
    print("Start to getOrCreate SparkContext")
    os.environ['PYSPARK_PYTHON'] = \
        python_location if python_location else self._detect_python_location()
    master = "local[{}]".format(cores)
    zoo_conf = init_spark_conf(conf).setMaster(master)
    sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
    sc.setLogLevel(self.spark_log_level)
    print("Successfully got a SparkContext")
    return sc
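# Illustrative usage sketch (not part of the original source). It assumes a SparkRunner
# is constructed with spark_log_level and redirect_spark_log settings; the exact
# constructor arguments shown here are assumptions.
#
#   runner = SparkRunner(spark_log_level="WARN", redirect_spark_log=False)
#   sc = runner.init_spark_on_local(cores=4)
#   print(sc.parallelize(range(100)).count())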
def init_spark_standalone(self, num_executors, executor_cores, executor_memory="10g",
                          driver_memory="1g", driver_cores=4, master=None,
                          extra_executor_memory_for_ray=None, extra_python_lib=None,
                          conf=None, jars=None):
    import subprocess
    import pyspark
    from zoo.util.utils import get_node_ip
    from zoo.util.engine import get_analytics_zoo_classpath
    from bigdl.util.engine import get_bigdl_classpath

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
    if not master:
        pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
        zoo_standalone_home = os.path.abspath(__file__ + "/../../share/bin/standalone")
        node_ip = get_node_ip()
        SparkRunner.standalone_env = {
            "SPARK_HOME": pyspark_home,
            "ZOO_STANDALONE_HOME": zoo_standalone_home,
            # If this is not set, the master defaults to the hostname instead of the IP.
            "SPARK_MASTER_HOST": node_ip}
        # The scripts installed from pip don't have execution permission,
        # so we need to grant it first.
        pro = subprocess.Popen(
            ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
        os.waitpid(pro.pid, 0)
        # Start master
        start_master_pro = subprocess.Popen(
            "{}/sbin/start-master.sh".format(zoo_standalone_home),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_master_pro.pid, 0)
        master = "spark://{}:7077".format(node_ip)  # 7077 is the default port
        # Start worker
        start_worker_pro = subprocess.Popen(
            "{}/sbin/start-worker.sh {}".format(zoo_standalone_home, master),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_worker_pro.pid, 0)
    else:
        # A Spark standalone cluster has already been started by the user.
        assert master.startswith("spark://"), \
            "Please input a valid master address for your Spark standalone cluster: " \
            "spark://master:port"

    # Start pyspark-shell
    submit_args = " --master " + master
    submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                " --executor-cores {} --executor-memory {}" \
        .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
    if extra_python_lib:
        submit_args = submit_args + " --py-files {}".format(extra_python_lib)
    if jars:
        submit_args = submit_args + " --jars {}".format(jars)
    submit_args = submit_args + " pyspark-shell"
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

    zoo_bigdl_jar_path = ":".join([get_analytics_zoo_classpath(), get_bigdl_classpath()])
    spark_conf = init_spark_conf(conf) \
        .set("spark.driver.cores", driver_cores) \
        .set("spark.driver.memory", driver_memory) \
        .set("spark.executor.instances", num_executors) \
        .set("spark.executor.cores", executor_cores) \
        .set("spark.cores.max", num_executors * executor_cores) \
        .set("spark.executorEnv.PYTHONHOME",
             "/".join(self._detect_python_location().split("/")[:-2]))
    if extra_executor_memory_for_ray:
        spark_conf.set("spark.executor.memoryOverhead", extra_executor_memory_for_ray)
    if spark_conf.contains("spark.executor.extraClassPath"):
        spark_conf.set(
            "spark.executor.extraClassPath",
            "{}:{}".format(zoo_bigdl_jar_path, conf.get("spark.executor.extraClassPath")))
    else:
        spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

    sc = init_nncontext(spark_conf, redirect_spark_log=self.redirect_spark_log)
    sc.setLogLevel(self.spark_log_level)
    return sc
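# Illustrative usage sketch (not part of the original source). When master is None,
# a standalone master and worker are launched from the pip-installed scripts on the
# local node; otherwise an existing spark://host:port master is reused. The argument
# values below are assumptions.
#
#   sc = runner.init_spark_standalone(
#       num_executors=2,
#       executor_cores=4,
#       executor_memory="10g",
#       master=None)  # or master="spark://<host>:7077" for an existing cluster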
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from bigdl.optim.optimizer import Adam
from keras.datasets import imdb
from keras.preprocessing import sequence
from zoo.pipeline.api.keras.models import Model
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.autograd import *
from zoo.common.nncontext import init_spark_conf
from zoo.common.nncontext import init_nncontext

conf = init_spark_conf()
conf.set("spark.executor.extraJavaOptions", "-Xss512m")
conf.set("spark.driver.extraJavaOptions", "-Xss512m")
sc = init_nncontext(conf)

max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
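# A minimal, illustrative continuation (a sketch, not the model from the original
# example): a small Sequential classifier on the padded sequences using the
# Analytics Zoo Keras-style API. The layer choices and fit arguments are assumptions.
from zoo.pipeline.api.keras.models import Sequential

model = Sequential()
model.add(Embedding(max_features, 128, input_shape=(max_len,)))  # token ids -> dense vectors
model.add(GlobalAveragePooling1D())  # average over the time dimension
model.add(Dense(1, activation="sigmoid"))  # binary sentiment prediction
model.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"])
# In distributed mode the batch size should be a multiple of the total executor cores.
model.fit(x_train, y_train, batch_size=160, nb_epoch=1, validation_data=(x_test, y_test))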