Example #1
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from py4j.java_gateway import java_import, JavaGateway, GatewayClient

# start JVM gateway
client = GatewayClient(address='127.0.0.1', port=${JVM_GATEWAY_PORT})
gateway = JavaGateway(client)
java_import(gateway.jvm, "org.apache.zeppelin.display.Input")
intp = gateway.entry_point
Example #2
import os
import warnings

from py4j.java_gateway import java_import, JavaGateway, GatewayClient
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# for back compatibility
from pyspark.sql import SQLContext

# start JVM gateway
if "PY4J_GATEWAY_SECRET" in os.environ:
    from py4j.java_gateway import GatewayParameters
    gateway_secret = os.environ["PY4J_GATEWAY_SECRET"]
    gateway = JavaGateway(gateway_parameters=GatewayParameters(address="${JVM_GATEWAY_ADDRESS}",
        port=${JVM_GATEWAY_PORT}, auth_token=gateway_secret, auto_convert=True))
else:
    gateway = JavaGateway(GatewayClient(address="${JVM_GATEWAY_ADDRESS}", port=${JVM_GATEWAY_PORT}), auto_convert=True)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.ml.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
java_import(gateway.jvm, "org.apache.spark.resource.*")

intp = gateway.entry_point

if intp.isSpark3():
    warnings.filterwarnings(action='ignore', module='pyspark.util')

jsc = intp.getJavaSparkContext()
Example #3
                    completionList.add(completionItem)
        if len(completionList) <= 0:
            self.interpreterObject.setStatementsFinished("", False)
        else:
            result = json.dumps(
                list(
                    filter(lambda x: not re.match("^__.*", x),
                           list(completionList))))
            self.interpreterObject.setStatementsFinished(result, False)


output = Logger()
sys.stdout = output
sys.stderr = output

client = GatewayClient(port=int(sys.argv[1]))
sparkVersion = SparkVersion(int(sys.argv[2]))

if sparkVersion.isSpark2():
    from pyspark.sql import SparkSession
else:
    from pyspark.sql import SchemaRDD

if sparkVersion.isAutoConvertEnabled():
    gateway = JavaGateway(client, auto_convert=True)
else:
    gateway = JavaGateway(client)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
Example #4
 def setUp(self):
     self.p = start_example_app_process()
     gateway_client = GatewayClient()
     self.gateway = JavaGateway()
     self.gateway.set_gateway_client(gateway_client)
Example #5
    """
    Import a python class where its identity is not known until runtime.
    :param cls: The fully qualified path of the class including module
    prefixes, e.g. sparkjobserver.api.SparkJob
    :return: The constructor for the class, as a function which can be
    called to instantiate an instance.
    """
    (module_name, class_name) = cls.rsplit('.', 1)
    module = import_module(module_name)
    c = getattr(module, class_name)
    return c


if __name__ == "__main__":
    port = int(sys.argv[1])
    gateway = JavaGateway(GatewayClient(port=port), auto_convert=True)
    entry_point = gateway.entry_point
    imports = entry_point.getPy4JImports()
    for i in imports:
        java_import(gateway.jvm, i)

    context_config =\
        ConfigFactory.parse_string(entry_point.contextConfigAsHocon())
    job_id = entry_point.jobId()
    job_env = JobEnvironment(job_id, None, context_config)
    job_config = ConfigFactory.parse_string(entry_point.jobConfigAsHocon())
    job_class = import_class(entry_point.jobClass())
    job = job_class()

    jcontext = entry_point.context()
    jspark_conf = entry_point.sparkConf()
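
A minimal usage sketch of the import_class helper above, reusing the illustrative class path from its own docstring (sparkjobserver.api.SparkJob; nothing else here is part of the original script):

# import_class splits on the last '.', imports the module, and returns the class object
job_cls = import_class('sparkjobserver.api.SparkJob')
job = job_cls()   # instantiate the dynamically resolved job class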
Example #6
def launch_gateway():
    SPARK_HOME = os.environ["SPARK_HOME"]

    gateway_port = -1
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
    else:
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS")
        submit_args = submit_args if submit_args is not None else ""
        submit_args = shlex.split(submit_args)
        command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"]
        if not on_windows:
            # Don't send ctrl-c / SIGINT to the Java gateway:
            def preexec_func():
                signal.signal(signal.SIGINT, signal.SIG_IGN)
            proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func)
        else:
            # preexec_fn not supported on Windows
            proc = Popen(command, stdout=PIPE, stdin=PIPE)

        try:
            # Determine which ephemeral port the server started on:
            gateway_port = proc.stdout.readline()
            gateway_port = int(gateway_port)
        except ValueError:
            # Grab the remaining lines of stdout
            (stdout, _) = proc.communicate()
            exit_code = proc.poll()
            error_msg = "Launching GatewayServer failed"
            error_msg += " with exit code %d!\n" % exit_code if exit_code else "!\n"
            error_msg += "Warning: Expected GatewayServer to output a port, but found "
            if gateway_port == "" and stdout == "":
                error_msg += "no output.\n"
            else:
                error_msg += "the following:\n\n"
                error_msg += "--------------------------------------------------------------\n"
                error_msg += gateway_port + stdout
                error_msg += "--------------------------------------------------------------\n"
            raise Exception(error_msg)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

        # Create a thread to echo output from the GatewayServer, which is required
        # for Java log output to show up:
        class EchoOutputThread(Thread):

            def __init__(self, stream):
                Thread.__init__(self)
                self.daemon = True
                self.stream = stream

            def run(self):
                while True:
                    line = self.stream.readline()
                    sys.stderr.write(line)
        EchoOutputThread(proc.stdout).start()

    # Connect to the gateway
    gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=False)

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
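
A hedged usage sketch of the launch_gateway() defined above (it assumes SPARK_HOME points at a real Spark install; SparkConf is reachable unqualified on the JVM because of the java_import calls):

import os

os.environ.setdefault("SPARK_HOME", "/opt/spark")   # assumption: local Spark install
gw = launch_gateway()
jconf = gw.jvm.SparkConf()               # JVM-side org.apache.spark.SparkConf
jconf.setAppName("gateway-smoke-test")
print(jconf.toDebugString())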
Example #7
import ast
import sys

from time import sleep
from py4j.java_gateway import java_import, JavaGateway, GatewayClient
from py4j.protocol import Py4JNetworkError

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

import logging
logging.basicConfig(filename='logs/python.log', level=logging.INFO)
logging.info('Starting python module for sparkle notebook')

# Connect to the gateway
gateway = JavaGateway(GatewayClient(port=int(sys.argv[1])), auto_convert=True)

# Import the classes used by PySpark
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.ml.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
# TODO(davies): move into sql
java_import(gateway.jvm, "org.apache.spark.sql.*")
java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")

python_kernel = gateway.entry_point
# auto generated variable counter
var_counter = 0
Example #8
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
from py4j.java_gateway import JavaGateway, GatewayClient, java_import
from py4j.protocol import Py4JJavaError

# gateway_address and gateway_port are set in the kernel
gateway = JavaGateway(GatewayClient(address=gateway_address,
                                    port=gateway_port),
                      start_callback_server=False,
                      auto_convert=True)

java_spark_context = gateway.entry_point.getSparkContext()
java_spark_conf = gateway.entry_point.getSparkConf()

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
java_import(gateway.jvm, "org.apache.spark.sql.*")
java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")
java_import(gateway.jvm, "scala.collection.immutable.List")
Example #9
    if replName is None:
      self.z.unregisterHook(event)
    else:
      self.z.unregisterHook(event, replName)

  def registerNoteHook(self, event, cmd, noteId, replName=None):
    if replName is None:
      self.z.registerNoteHook(event, cmd, noteId)
    else:
      self.z.registerNoteHook(event, cmd, noteId, replName)

  def unregisterNoteHook(self, event, noteId, replName=None):
    if replName is None:
      self.z.unregisterNoteHook(event, noteId)
    else:
      self.z.unregisterNoteHook(event, noteId, replName)

# start JVM gateway
if "PY4J_GATEWAY_SECRET" in os.environ:
  from py4j.java_gateway import GatewayParameters
  gateway_secret = os.environ["PY4J_GATEWAY_SECRET"]
  gateway = JavaGateway(gateway_parameters=GatewayParameters(
    port=${JVM_GATEWAY_PORT}, auth_token=gateway_secret, auto_convert=True))
else:
  gateway = JavaGateway(GatewayClient(port=${JVM_GATEWAY_PORT}), auto_convert=True)

java_import(gateway.jvm, "org.apache.zeppelin.display.Input")
intp = gateway.entry_point
z = __zeppelin__ = PyZeppelinContext(intp.getZeppelinContext())
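
A hedged usage sketch of the note-hook helpers defined above (the event name follows Zeppelin's pre_exec/post_exec convention; the note id is illustrative):

# run a snippet after every paragraph of one note, then remove the hook again
z.registerNoteHook("post_exec", "print('paragraph finished')", "2ABCDEFGH")
z.unregisterNoteHook("post_exec", "2ABCDEFGH")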

Example #10
    def get_departures(self):
        # set gzip header
        cherrypy.response.headers['Content-Type'] = 'application/gzip'
        # create the return tuple
        return_tuple = {}
        return_tuple['departures'] = []
        return_tuple['warning'] = ""
        return_tuple['error'] = ""
        translator = Translator(Config().get_param("default_language"))
        # parse json encoded input
        options = helper.convert_dict_values_to_utf8(cherrypy.request.json)

        # user language
        language = ""
        if "language" in options:
            language = options['language']
        # if the user sends a language, which is not german, take the default language setting
        if language != "de":
            language = Config().get_param("default_language")
        # initialize the translator object with the user's choosen language
        translator = Translator(language)

        # check latitude, longitude and vehicles parameters
        try:
            lat = float(options['lat'])
        except KeyError as e:
            return_tuple['error'] = translator.translate(
                "message", "no_latitude_value")
            return helper.zip_data(return_tuple)
        except ValueError as e:
            return_tuple['error'] = translator.translate(
                "message", "no_latitude_value")
            return helper.zip_data(return_tuple)
        try:
            lon = float(options['lon'])
        except KeyError as e:
            return_tuple['error'] = translator.translate(
                "message", "no_longitude_value")
            return helper.zip_data(return_tuple)
        except ValueError as e:
            return_tuple['error'] = translator.translate(
                "message", "no_longitude_value")
            return helper.zip_data(return_tuple)
        try:
            vehicles = options['vehicles'].split("+")
        except KeyError as e:
            vehicles = []

        # get the nearest stations for this coordinates and take the first one
        gateway = JavaGateway(
            GatewayClient(port=Config().get_param("gateway_port")),
            auto_field=True)
        main_point = gateway.entry_point
        closest_stations_result = main_point.getNearestStations(
            geometry.convert_coordinate_to_int(lat),
            geometry.convert_coordinate_to_int(lon))
        if closest_stations_result.status.toString() == "INVALID_STATION":
            return_tuple['error'] = translator.translate(
                "message", "no_station_for_this_coordinates")
            return helper.zip_data(return_tuple)
        if closest_stations_result.status.toString() == "SERVICE_DOWN":
            return_tuple['error'] = translator.translate(
                "message", "bahn_server_down")
            return helper.zip_data(return_tuple)
        if closest_stations_result.locations is None or len(
                closest_stations_result.locations) == 0:
            return_tuple['error'] = translator.translate(
                "message", "no_station_for_this_coordinates")
            return helper.zip_data(return_tuple)

        # get departures for station
        sfinder = StationFinder(translator)
        station = sfinder.choose_station_by_vehicle_type(
            closest_stations_result.locations, lat, lon, vehicles)
        departures_result = main_point.getDepartures(station.id)
        date_format = gateway.jvm.java.text.SimpleDateFormat(
            "HH:mm", gateway.jvm.java.util.Locale.GERMAN)
        for station_departure in departures_result.stationDepartures:
            for departure in station_departure.departures:
                try:
                    dep_entry = {}
                    dep_entry['nr'] = "%s%s" % (departure.line.product.code,
                                                departure.line.label)
                    dep_entry['to'] = departure.destination.name
                    dep_entry['time'] = date_format.format(
                        departure.plannedTime)
                    # remaining time
                    duration = departure.plannedTime.getTime() / 1000 - int(
                        time.time())
                    minutes, seconds = divmod(duration, 60)
                    dep_entry['remaining'] = minutes
                    return_tuple['departures'].append(dep_entry)
                except Exception as e:
                    pass

        # convert return_tuple to json and zip it, before returning
        return helper.zip_data(return_tuple)
Example #11
 def setUp(self):
     self.p = start_example_app_process()
     # This is to ensure that the server is started before connecting to it!
     time.sleep(1)
     gateway_client = GatewayClient()
     self.gateway = JavaGateway(gateway_client=gateway_client)
Example #12
File: core.py Project: jcrist/knit
    def start(self,
              cmd,
              num_containers=1,
              virtual_cores=1,
              memory=128,
              files=None,
              envvars=None,
              app_name="knit",
              queue="default",
              checks=True):
        """
        Method to start a yarn app with a distributed shell

        Parameters
        ----------
        cmd: str
            command to run in each yarn container
        num_containers: int
            Number of containers YARN should request (default: 1)
            * A container should be requested with the number of cores it can
              saturate, i.e.
            * the average number of threads it expects to have runnable at a
              time.
        virtual_cores: int
            Number of virtual cores per container (default: 1)
            * A node's capacity should be configured with virtual cores equal to
            * its number of physical cores.
        memory: int
            Memory per container (default: 128)
            * The unit for memory is megabytes.
        files: list
            list of files to be include in each container. If starting with
            `hdfs://`, assume these already exist in HDFS and don't need
            uploading. Otherwise, if hdfs3 is installed, existence of the
            file on HDFS will be checked to see if upload is needed.
            Files ending with `.zip` will be decompressed in the
            container before launch as a directory with the same name as the
            file: if myarc.zip contains files inside a directory stuff/, to
            the container they will appear at ./myarc.zip/stuff/* .
        envvars: dict
            Environment variables to pass to AM *and* workers. Both keys
            and values must be strings only.
        app_name: String
            Application name shown in YARN (default: "knit")
        queue: String
            RM Queue to use while scheduling (default: "default")
        checks: bool=True
            Whether to run pre-flight checks before submitting app to YARN

        Returns
        -------
        applicationId: str
            A yarn application ID string
        """
        files = files or []
        envvars = envvars or {'KNIT_LANG': self.lang}
        for k, v in envvars.items():
            if not isinstance(k, str) or not isinstance(v, str):
                raise ValueError('Environment must contain only strings (%s)' %
                                 ((k, v), ))
        if self.app_id:
            raise ValueError('Already started')
        if not isinstance(memory, int):
            raise KnitException("Memory argument must be an integer")
        if files:
            if not isinstance(files, list):
                raise KnitException("File argument must be a list of strings")

        if checks:
            self._pre_flight_checks(num_containers, virtual_cores, memory,
                                    files, queue)
        # From https://github.com/apache/spark/blob/d83c2f9f0b08d6d5d369d9fae04cdb15448e7f0d/python/pyspark/java_gateway.py
        # thank you spark

        ## Socket for PythonGatewayServer to communicate its port to us
        callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        callback_socket.bind(('127.0.0.1', 0))
        callback_socket.listen(1)
        callback_host, callback_port = callback_socket.getsockname()

        if not os.path.exists(self.JAR_FILE_PATH):
            raise KnitException('JAR file %s does not exists - please build'
                                ' with maven' % self.JAR_FILE_PATH)
        args = [
            "hadoop", "jar", self.JAR_FILE_PATH, self.JAVA_APP,
            "--callbackHost",
            str(callback_host), "--callbackPort",
            str(callback_port)
        ]

        ## Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        if not on_windows:
            # Don't send ctrl-c / SIGINT to the Java gateway:
            def preexec_func():
                signal.signal(signal.SIGINT, signal.SIG_IGN)

            proc = Popen(args, stdin=PIPE, preexec_fn=preexec_func)
        else:
            # preexec_fn not supported on Windows
            proc = Popen(args, stdin=PIPE)
        self.proc = proc
        gateway_port = None
        # We use select() here in order to avoid blocking indefinitely if the
        # subprocess dies before connecting
        long_timeout = 60
        while gateway_port is None and proc.poll(
        ) is None and long_timeout > 0:
            timeout = 1  # (seconds)
            readable, _, _ = select.select([callback_socket], [], [], timeout)
            if callback_socket in readable:
                gateway_connection = callback_socket.accept()[0]
                # Determine which ephemeral port the server started on:
                gateway_port = read_int(gateway_connection.makefile(mode="rb"))
                gateway_connection.close()
                callback_socket.close()
            long_timeout -= 1

        if gateway_port is None:
            raise Exception(
                "The JVM Knit client failed to launch successfully."
                " Check that java is installed and the Knit JAR"
                " file exists.")

        gateway = JavaGateway(GatewayClient(port=gateway_port),
                              auto_convert=True)
        self.client = gateway.entry_point
        self.client_gateway = gateway
        logger.debug("Files submitted: %s" % files)
        upfiles = [
            f for f in files
            if (not f.startswith('hdfs://') and self.check_needs_upload(f))
        ]
        logger.debug("Files to upload: %s" % upfiles)
        jfiles = ListConverter().convert(upfiles, gateway._gateway_client)
        jenv = MapConverter().convert(envvars, gateway._gateway_client)

        self.app_id = self.client.start(jfiles, jenv, app_name, queue)

        ## Wait for AM to appear
        long_timeout = 100
        master_rpcport = -1
        while master_rpcport == -1:
            master_rpcport = self.client.masterRPCPort()
            time.sleep(0.2)
            long_timeout -= 0.2
            if long_timeout < 0:
                break

        if master_rpcport in [-1, 'N/A']:
            raise Exception(
                """The application master JVM process failed to report back. This can mean:
 - that the YARN cluster cannot schedule adequate resources - check
   k.yarn_api.cluster_metrics() and other diagnostic methods;
 - that the ApplicationMaster crashed - check the application logs, k.logs();
 - that the cluster is otherwise unhealthy - check the RM and NN logs
   (use k.yarn_api.system_logs() to find these on a one-node system)
""")
        master_rpchost = self.client.masterRPCHost()

        gateway = JavaGateway(GatewayClient(address=master_rpchost,
                                            port=master_rpcport),
                              auto_convert=True)
        self.master = gateway.entry_point
        rfiles = [
            triple_slash(f) if f.startswith('hdfs://') else '/'.join(
                ['hdfs://', self.hdfs_home, '.knitDeps',
                 os.path.basename(f)]) for f in files
        ]
        logger.debug("Resource files: %s" % rfiles)
        jfiles = ListConverter().convert(rfiles, gateway._gateway_client)
        jenv = MapConverter().convert(envvars, gateway._gateway_client)
        self.master.init(jfiles, jenv, cmd, num_containers, virtual_cores,
                         memory)

        return self.app_id
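
A hedged usage sketch of the start() method documented above (it assumes the method lives on knit's Knit class and that a YARN cluster plus the built Knit JAR are available; the command is illustrative):

from knit import Knit   # assumption: Knit exposes the start() shown above

k = Knit()
app_id = k.start("python -c 'print(42)'",
                 num_containers=2,
                 virtual_cores=1,
                 memory=256,
                 envvars={"KNIT_LANG": "C.UTF-8"})
print("submitted application:", app_id)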
Example #13
            completionList = self.getMethodCompletion(objName, methodName)

        if completionList is None or len(completionList) <= 0:
            self.interpreter.setStatementsFinished("", False)
        else:
            result = json.dumps(
                list(
                    filter(lambda x: not re.match("^__.*", x),
                           list(completionList))))
            self.interpreter.setStatementsFinished(result, False)


host = sys.argv[1]
port = int(sys.argv[2])

client = GatewayClient(address=host, port=port)
gateway = JavaGateway(client, auto_convert=True)
intp = gateway.entry_point
# redirect stdout/stderr to java side so that PythonInterpreter can capture the python execution result
output = Logger()
sys.stdout = output
sys.stderr = output

_zcUserQueryNameSpace = {}

completion = PythonCompletion(intp, _zcUserQueryNameSpace)
_zcUserQueryNameSpace["__zeppelin_completion__"] = completion
_zcUserQueryNameSpace["gateway"] = gateway

from zeppelin_context import PyZeppelinContext
if intp.getZeppelinContext():
Example #14
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


from py4j.java_gateway import java_import, JavaGateway, GatewayClient
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# for back compatibility
from pyspark.sql import SQLContext

# start JVM gateway
client = GatewayClient(port=${JVM_GATEWAY_PORT})
gateway = JavaGateway(client, auto_convert=True)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")

intp = gateway.entry_point
jsc = intp.getJavaSparkContext()

java_import(gateway.jvm, "org.apache.spark.sql.*")
java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")
Example #15
def launch_gateway():
    SPARK_HOME = os.environ["SPARK_HOME"]

    gateway_port = -1
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
    else:
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS")
        submit_args = submit_args if submit_args is not None else ""
        submit_args = shlex.split(submit_args)
        command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"]
        if not on_windows:
            # Don't send ctrl-c / SIGINT to the Java gateway:
            def preexec_func():
                signal.signal(signal.SIGINT, signal.SIG_IGN)
            proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func)
        else:
            # preexec_fn not supported on Windows
            proc = Popen(command, stdout=PIPE, stdin=PIPE)

        try:
            # Determine which ephemeral port the server started on:
            gateway_port = proc.stdout.readline()
            gateway_port = int(gateway_port)
        except ValueError:
            # Grab the remaining lines of stdout
            (stdout, _) = proc.communicate()
            exit_code = proc.poll()
            error_msg = "Launching GatewayServer failed"
            error_msg += " with exit code %d!\n" % exit_code if exit_code else "!\n"
            error_msg += "Warning: Expected GatewayServer to output a port, but found "
            if gateway_port == "" and stdout == "":
                error_msg += "no output.\n"
            else:
                error_msg += "the following:\n\n"
                error_msg += "--------------------------------------------------------------\n"
                error_msg += gateway_port + stdout
                error_msg += "--------------------------------------------------------------\n"
            raise Exception(error_msg)

        # Create a thread to echo output from the GatewayServer, which is required
        # for Java log output to show up:
        class EchoOutputThread(Thread):

            def __init__(self, stream):
                Thread.__init__(self)
                self.daemon = True
                self.stream = stream

            def run(self):
                while True:
                    line = self.stream.readline()
                    sys.stderr.write(line)
        EchoOutputThread(proc.stdout).start()

    # Connect to the gateway
    gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=False)

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
Example #16
        if QMessageBox.question(None, '', 'Are you sure you want to quit?',
                                QMessageBox.Yes | QMessageBox.No, QMessageBox.No) == QMessageBox.Yes:
            QApplication.quit()

    def closeEvent(self, event):
        self.closing.emit()
        super(GameRacko, self).closeEvent(event)

if __name__ == "__main__":
    host = '127.0.0.1'
    port_number = 25333
    while port_number < 25335:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(('', 0))
        port_number = s.getsockname()[1]
        s.close()
    try:
        subprocess.Popen(['java', '-jar', 'RackoComputerPlayersGateway.jar', str(port_number)])
        time.sleep(1)
    except:
        sys.exit()

    gateway_server = JavaGateway(GatewayClient(address=host, port=port_number))
    app = QApplication(sys.argv)
    window = GameRacko(gateway_server)
    window.show()
    while app.exec_() > 0:
        time.sleep(1)
    gateway_server.shutdown()
    sys.exit()
Example #17
def main():
    sys_stdin = sys.stdin
    sys_stdout = sys.stdout
    sys_stderr = sys.stderr

    if sys.version >= '3':
        sys.stdin = io.StringIO()
    else:
        sys.stdin = cStringIO.StringIO()

    sys.stdout = UnicodeDecodingStringIO()
    sys.stderr = UnicodeDecodingStringIO()

    spark_major_version = os.getenv("LIVY_SPARK_MAJOR_VERSION")
    try:
        listening_port = 0
        if os.environ.get("LIVY_TEST") != "true":
            #Load spark into the context
            exec('from pyspark.shell import sc', global_dict)
            exec('from pyspark.shell import sqlContext', global_dict)
            exec('from pyspark.sql import HiveContext', global_dict)
            exec('from pyspark.streaming import StreamingContext', global_dict)
            exec('import pyspark.cloudpickle as cloudpickle', global_dict)
            if spark_major_version >= "2":
                exec('from pyspark.shell import spark', global_dict)

            #Start py4j callback server
            from py4j.protocol import ENTRY_POINT_OBJECT_ID
            from py4j.java_gateway import JavaGateway, GatewayClient, CallbackServerParameters

            gateway_client_port = int(os.environ.get("PYSPARK_GATEWAY_PORT"))
            gateway = JavaGateway(GatewayClient(port=gateway_client_port))
            gateway.start_callback_server(
                callback_server_parameters=CallbackServerParameters(port=0))
            socket_info = gateway._callback_server.server_socket.getsockname()
            listening_port = socket_info[1]
            pyspark_job_processor = PySparkJobProcessorImpl()
            gateway.gateway_property.pool.dict[
                ENTRY_POINT_OBJECT_ID] = pyspark_job_processor

            global local_tmp_dir_path, job_context
            local_tmp_dir_path = tempfile.mkdtemp()
            job_context = JobContextImpl()

        print(sys.stdout.getvalue(), file=sys_stderr)
        print(sys.stderr.getvalue(), file=sys_stderr)

        clearOutputs()

        print('READY(port=' + str(listening_port) + ')', file=sys_stdout)
        sys_stdout.flush()

        while True:
            line = sys_stdin.readline()

            if line == '':
                break
            elif line == '\n':
                continue

            try:
                msg = json.loads(line)
            except ValueError:
                LOG.error('failed to parse message', exc_info=True)
                continue

            try:
                msg_type = msg['msg_type']
            except KeyError:
                LOG.error('missing message type', exc_info=True)
                continue

            try:
                content = msg['content']
            except KeyError:
                LOG.error('missing content', exc_info=True)
                continue

            if not isinstance(content, dict):
                LOG.error('content is not a dictionary')
                continue

            try:
                handler = msg_type_router[msg_type]
            except KeyError:
                LOG.error('unknown message type: %s', msg_type)
                continue

            response = handler(content)

            try:
                response = json.dumps(response)
            except ValueError:
                response = json.dumps({
                    'msg_type': 'inspect_reply',
                    'content': {
                        'status': 'error',
                        'ename': 'ValueError',
                        'evalue': 'cannot json-ify %s' % response,
                        'traceback': [],
                    }
                })

            print(response, file=sys_stdout)
            sys_stdout.flush()
    finally:
        if os.environ.get("LIVY_TEST") != "true" and 'sc' in global_dict:
            gateway.shutdown_callback_server()
            shutil.rmtree(local_tmp_dir_path)
            global_dict['sc'].stop()

        sys.stdin = sys_stdin
        sys.stdout = sys_stdout
        sys.stderr = sys_stderr
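
A hedged sketch of one message on the stdin line protocol the loop above parses (msg_type and content are the fields it requires; the execute_request handler name is an assumption about msg_type_router):

import json

# what a client would write to this process's stdin, one JSON document per line
request = {"msg_type": "execute_request", "content": {"code": "print(1 + 1)"}}
print(json.dumps(request))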
Example #18
    def __init__(self, versionNumber):
        self.version = versionNumber

    def isAutoConvertEnabled(self):
        return self.version >= self.SPARK_1_4_0

    def isImportAllPackageUnderSparkSql(self):
        return self.version >= self.SPARK_1_3_0

output = Logger()
errorOutput = ErrorLogger()
sys.stdout = output
sys.stderr = errorOutput

try:
    client = GatewayClient(port=int(sys.argv[1]),
                           gateway_parameters=GatewayParameters(port = int(sys.argv[1]), auto_convert = True, auth_token = sys.argv[3]))
except:
    client = GatewayClient(port=int(sys.argv[1]))

sparkVersion = SparkVersion(int(sys.argv[2]))

if sparkVersion.isAutoConvertEnabled():
    try:
        gateway = JavaGateway(client, auto_field = True, auto_convert = True,
                              gateway_parameters=GatewayParameters(port = int(sys.argv[1]), auto_convert = True, auth_token = sys.argv[3]))
    except:
        gateway = JavaGateway(client, auto_convert = True)
else:
    gateway = JavaGateway(client)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
Example #19
def launch_gateway(conf=None):
    """
    launch jvm gateway
    :param conf: configuration that ml_runner must have
    :return:
    """
    if "PYANGEL_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYANGEL_GATEWAY_PORT"])
    else:
        ANGEL_HOME = _find_angel_home()
        # Launch the Py4j gateway

        if os.environ.get("PYANGEL_LOCAL_MODE") == "True":
            script = "./bin/angel-local-submit"
        else:
            script = "./bin/angel-submit"
        command = [os.path.join(ANGEL_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get(
            "PYANGEL_SUBMIT_ARGS",
            "--angel.app.submit.class com.tencent.angel.api.python.PythonGatewayServer"
        )
        command = command + shlex.split(submit_args)

        # Start a socket that will be used by PythonGatewayServer to communicate its port to python sub-proc
        callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        callback_socket.bind(('127.0.0.1', 0))
        callback_socket.listen(1)
        callback_host, callback_port = callback_socket.getsockname()
        env = dict(os.environ)
        env['_PYANGEL_CALLBACK_HOST'] = callback_host
        env['_PYANGEL_CALLBACK_PORT'] = str(callback_port)

        # Don't send ctrl-c / SIGINT to the Java gateway:
        def preexec_func():
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        gateway_port = None
        # We use select() here in order to avoid blocking indefinitely if the subprocess dies
        # before connecting
        while gateway_port is None and proc.poll() is None:
            timeout = 1  # (seconds)
            readable, _, _ = select.select([callback_socket], [], [], timeout)
            if callback_socket in readable:
                gateway_connection = callback_socket.accept()[0]
                # Determine which ephemeral port the server started on:
                gateway_port = read_int(gateway_connection.makefile(mode="rb"))
                gateway_connection.close()
                callback_socket.close()
        if gateway_port is None:
            raise Exception(
                "Java gateway process exited before sending the driver its port number"
            )

    # Connect to the gateway
    gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True)

    return gateway
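
A hedged usage sketch of this launch_gateway (PYANGEL_LOCAL_MODE and the submit scripts come from the code above; actually running it requires an Angel install at ANGEL_HOME):

import os

os.environ["PYANGEL_LOCAL_MODE"] = "True"   # pick the local submit script above
gw = launch_gateway()                       # blocks until the JVM reports its port
print(gw.jvm.java.lang.System.getProperty("java.version"))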
Example #20
File: jumpy.py Project: gumffskin/jumpy
sys.path.insert(1, here)

import imp, traceback, __builtin__
try: import simplejson as json
except: import json
from ConfigParser import RawConfigParser
from StringIO import StringIO
from UserDict import DictMixin
from py4j.java_gateway import GatewayClient, JavaGateway
import vmsg, imgfx

try: pms
except NameError:
	host, port = os.environ['JGATEWAY'].split(':')
	hascb, cbport = (True, int(os.getenv('JCLIENT'))) if 'JCLIENT' in os.environ else (False, None)
	gateway_client = GatewayClient(address=host, port=int(port))
	gateway = JavaGateway(gateway_client, start_callback_server=hascb, python_proxy_port=cbport, auto_convert=True)
	__builtin__.pms = gateway.entry_point
	__builtin__.pms.gateway_client = gateway_client
	__builtin__.pms._addItem = pms.addItem
	__builtin__.pms._addPath = pms.addPath
	__builtin__.pms._setEnv = pms.setEnv
	__builtin__.pms._addPlayer = pms.addPlayer
	# constants from net.pms.formats.Format:
	__builtin__.PMS_AUDIO = 1
	__builtin__.PMS_IMAGE = 2
	__builtin__.PMS_VIDEO = 4
	__builtin__.PMS_UNKNOWN = 8
	__builtin__.PMS_PLAYLIST = 16
	__builtin__.PMS_ISO = 32
	__builtin__.PMS_CUSTOM = 64
Example #21
    def test_gateway_client(self):
        gateway_client = GatewayClient(port=DEFAULT_PORT)
        self.gateway = JavaGateway(gateway_client=gateway_client)

        i = self.gateway.jvm.System.currentTimeMillis()
        self.assertTrue(i > 0)
Example #22
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from py4j.java_gateway import JavaGateway
from py4j.java_gateway import java_import, JavaGateway, GatewayClient

client = GatewayClient(port=%PORT%)
gateway = JavaGateway(client)
java_import(gateway.jvm, "org.apache.zeppelin.display.Input")

class PyZeppelinContext():
    paramOption = gateway.jvm.org.apache.zeppelin.display.Input.ParamOption
    javaList = gateway.jvm.java.util.ArrayList

    def __init__(self, zc):
        self.z = zc

    def input(self, name, defaultValue=""):
        return self.z.getGui().input(name, defaultValue)

    def select(self, name, options, defaultValue=""):
        javaOptions = gateway.new_array(self.paramOption, len(options))
Example #23
                list(
                    filter(lambda x: not re.match("^__.*", x),
                           list(completionList))))
            self.interpreter.setStatementsFinished(result, False)


host = sys.argv[1]
port = int(sys.argv[2])

if "PY4J_GATEWAY_SECRET" in os.environ:
    from py4j.java_gateway import GatewayParameters
    gateway_secret = os.environ["PY4J_GATEWAY_SECRET"]
    gateway = JavaGateway(gateway_parameters=GatewayParameters(
        address=host, port=port, auth_token=gateway_secret, auto_convert=True))
else:
    gateway = JavaGateway(GatewayClient(address=host, port=port),
                          auto_convert=True)

intp = gateway.entry_point
_zcUserQueryNameSpace = {}

completion = PythonCompletion(intp, _zcUserQueryNameSpace)
_zcUserQueryNameSpace["__zeppelin_completion__"] = completion
_zcUserQueryNameSpace["gateway"] = gateway

from zeppelin_context import PyZeppelinContext
if intp.getZeppelinContext():
    z = __zeppelin__ = PyZeppelinContext(intp.getZeppelinContext(), gateway)
    __zeppelin__._setup_matplotlib()
    _zcUserQueryNameSpace["z"] = z
    _zcUserQueryNameSpace["__zeppelin__"] = __zeppelin__
Example #24
File: core.py Project: quartox/knit
    def start(self, cmd, num_containers=1, virtual_cores=1, memory=128, env="",
              files=[], app_name="knit", queue="default", checks=True,
              lang='C.UTF-8'):
        """
        Method to start a yarn app with a distributed shell

        Parameters
        ----------
        cmd: str
            command to run in each yarn container
        num_containers: int
            Number of containers YARN should request (default: 1)
            * A container should be requested with the number of cores it can
              saturate, i.e.
            * the average number of threads it expects to have runnable at a
              time.
        virtual_cores: int
            Number of virtual cores per container (default: 1)
            * A node's capacity should be configured with virtual cores equal to
            * its number of physical cores.
        memory: int
            Memory per container (default: 128)
            * The unit for memory is megabytes.
        env: string
            Full Path to zipped Python environment
        files: list
            list of files to be include in each container
        app_name: String
            Application name shown in YARN (default: "knit")
        queue: String
            RM Queue to use while scheduling (default: "default")
        checks: bool=True
            Whether to run pre-flight checks before submitting app to YARN
        lang: str
            Environment variable language setting, required for ``click`` to
            successfully read from the shell.

        Returns
        -------
        applicationId: str
            A yarn application ID string
        """
        if self.app_id:
            raise ValueError('Already started')
        if not isinstance(memory, int):
            raise KnitException("Memory argument must be an integer")
        if files:
            if not isinstance(files, list):
                raise KnitException("File argument must be a list of strings")

        if checks:
            self._pre_flight_checks(num_containers, virtual_cores, memory, env,
                                    files, queue)
        # From https://github.com/apache/spark/blob/d83c2f9f0b08d6d5d369d9fae04cdb15448e7f0d/python/pyspark/java_gateway.py
        # thank you spark

        # Start a socket that will be used by PythonGatewayServer to communicate its port to us
        callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        callback_socket.bind(('127.0.0.1', 0))
        callback_socket.listen(1)
        callback_host, callback_port = callback_socket.getsockname()

        if not os.path.exists(self.JAR_FILE_PATH):
            raise KnitException('JAR file %s does not exists - please build'
                                ' with maven' % self.JAR_FILE_PATH)
        args = ["hadoop", "jar", self.JAR_FILE_PATH, self.JAVA_APP,
                "--callbackHost", str(callback_host), "--callbackPort",
                str(callback_port)]

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        if not on_windows:
            # Don't send ctrl-c / SIGINT to the Java gateway:
            def preexec_func():
                signal.signal(signal.SIGINT, signal.SIG_IGN)
            proc = Popen(args, stdin=PIPE, preexec_fn=preexec_func)
        else:
            # preexec_fn not supported on Windows
            proc = Popen(args, stdin=PIPE)
        self.proc = proc
        gateway_port = None
        # We use select() here in order to avoid blocking indefinitely if the
        # subprocess dies before connecting
        long_timeout = 60
        while gateway_port is None and proc.poll() is None and long_timeout > 0:
            timeout = 1  # (seconds)
            readable, _, _ = select.select([callback_socket], [], [], timeout)
            if callback_socket in readable:
                gateway_connection = callback_socket.accept()[0]
                # Determine which ephemeral port the server started on:
                gateway_port = read_int(gateway_connection.makefile(mode="rb"))
                gateway_connection.close()
                callback_socket.close()
            long_timeout -= 1

        if gateway_port is None:
            raise Exception("The JVM Knit client failed to launch successfully."
                            " Check that java is installed and the Knit JAR"
                            " file exists.")

        gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True)
        self.client = gateway.entry_point
        self.client_gateway = gateway
        upload = self.check_env_needs_upload(env)
        self.app_id = self.client.start(env, ','.join(files), app_name, queue,
                                        str(upload), lang)

        long_timeout = 100
        master_rpcport = -1
        while master_rpcport == -1:
            master_rpcport = self.client.masterRPCPort()
            time.sleep(0.2)
            long_timeout -= 0.2
            if long_timeout < 0:
                break

        if master_rpcport in [-1, 'N/A']:
            raise Exception(
"""The application master JVM process failed to report back. This can mean:
 - that the YARN cluster cannot schedule adequate resources - check
   k.yarn_api.cluster_metrics() and other diagnostic methods;
 - that the ApplicationMaster crashed - check the application logs, k.logs();
 - that the cluster is otherwise unhealthy - check the RM and NN logs
   (use k.yarn_api.system_logs() to find these on a one-node system)
""")
        master_rpchost = self.client.masterRPCHost()

        gateway = JavaGateway(GatewayClient(
            address=master_rpchost, port=master_rpcport), auto_convert=True)
        self.master = gateway.entry_point
        self.master.init(env, ','.join(files), cmd, num_containers,
                         virtual_cores, memory)

        return self.app_id
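
A hedged sketch of this older start() signature (the zipped environment path and file list are illustrative; as in the previous knit example it assumes the method lives on knit's Knit class):

from knit import Knit   # assumption: Knit exposes this start() variant

k = Knit()
app_id = k.start("python my_worker.py",
                 num_containers=4,
                 memory=512,
                 env="hdfs:///envs/py35.zip",   # full path to a zipped Python env
                 files=["my_worker.py"])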
Example #25
 connectionTimeout = 15
 while port_number < 25335:
     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     s.bind(('', 0))
     port_number = s.getsockname()[1]
     s.close()
 try:
     p = subprocess.Popen(
         ['java', '-jar', 'FifteenPuzzleGateway.jar',
          str(port_number)])
     count = 0
     print("Connecting to server.  Please wait.")
     while count < connectionTimeout:
         time.sleep(1)
         gateway_server = JavaGateway(
             GatewayClient(address=host, port=port_number))
         count += 1
         connected = True
         try:
             gateway_server.entry_point.isConnected()
         except:
             connected = False
         if connected:
             break
         elif count % 2 == 0 and count < connectionTimeout:
             print(str(count) + " seconds passed, continue to wait.")
     if not connected:
         print("Connection time out over " + str(connectionTimeout) +
               " seconds")
         gateway_server.shutdown()
         p.kill()
Example #26
File: common.py Project: zhaonaiy/BigDL
 def __init__(self, bigdl_type, port=25333):
     self.value = JavaGateway(GatewayClient(port=port), auto_convert=True)
Example #27
def launch_gateway(conf=None):
    """
    launch jvm gateway
    :param conf: spark configuration passed to spark-submit
    :return:
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = ' '.join(
                ["--conf spark.ui.enabled=false", submit_args])
        command = command + shlex.split(submit_args)

        # Start a socket that will be used by PythonGatewayServer to communicate its port to us
        callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        callback_socket.bind(('127.0.0.1', 0))
        callback_socket.listen(1)
        callback_host, callback_port = callback_socket.getsockname()
        env = dict(os.environ)
        env['_PYSPARK_DRIVER_CALLBACK_HOST'] = callback_host
        env['_PYSPARK_DRIVER_CALLBACK_PORT'] = str(callback_port)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        if not on_windows:
            # Don't send ctrl-c / SIGINT to the Java gateway:
            def preexec_func():
                signal.signal(signal.SIGINT, signal.SIG_IGN)

            proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
        else:
            # preexec_fn not supported on Windows
            proc = Popen(command, stdin=PIPE, env=env)

        gateway_port = None
        # We use select() here in order to avoid blocking indefinitely if the subprocess dies
        # before connecting
        while gateway_port is None and proc.poll() is None:
            timeout = 1  # (seconds)
            readable, _, _ = select.select([callback_socket], [], [], timeout)
            if callback_socket in readable:
                gateway_connection = callback_socket.accept()[0]
                # Determine which ephemeral port the server started on:
                gateway_port = read_int(gateway_connection.makefile(mode="rb"))
                gateway_connection.close()
                callback_socket.close()
        if gateway_port is None:
            raise Exception(
                "Java gateway process exited before sending the driver its port number"
            )

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen([
                    "cmd", "/c", "taskkill", "/f", "/t", "/pid",
                    str(proc.pid)
                ])

            atexit.register(killChild)

    # Connect to the gateway
    gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True)

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
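
A hedged sketch of calling this launch_gateway with a SparkConf (conf.getAll() feeds the --conf flags built above; it assumes pyspark and a local Spark install are on hand):

from pyspark.conf import SparkConf

conf = SparkConf().set("spark.app.name", "gateway-demo")
gw = launch_gateway(conf)
# SparkConf is reachable unqualified on the JVM side thanks to the java_import calls
jconf = gw.jvm.SparkConf()
print(jconf.toDebugString())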
Example #28
def main():
    sys_stdin = sys.stdin
    sys_stdout = sys.stdout
    sys_stderr = sys.stderr

    if sys.version >= '3':
        sys.stdin = io.StringIO()
    else:
        sys.stdin = cStringIO.StringIO()

    sys.stdout = UnicodeDecodingStringIO()
    sys.stderr = UnicodeDecodingStringIO()

    spark_major_version = os.getenv("LIVY_SPARK_MAJOR_VERSION")
    try:
        listening_port = 0
        if os.environ.get("LIVY_TEST") != "true":
            #Load spark into the context
            exec('from pyspark.sql import HiveContext', global_dict)
            exec('from pyspark.streaming import StreamingContext', global_dict)
            exec('import pyspark.cloudpickle as cloudpickle', global_dict)

            from py4j.java_gateway import java_import, JavaGateway, GatewayClient
            from pyspark.conf import SparkConf
            from pyspark.context import SparkContext
            from pyspark.sql import SQLContext, HiveContext, Row
            # Connect to the gateway
            gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
            try:
                from py4j.java_gateway import GatewayParameters
                gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
                gateway = JavaGateway(gateway_parameters=GatewayParameters(
                    port=gateway_port,
                    auth_token=gateway_secret,
                    auto_convert=True))
            except:
                gateway = JavaGateway(GatewayClient(port=gateway_port),
                                      auto_convert=True)

            # Import the classes used by PySpark
            java_import(gateway.jvm, "org.apache.spark.SparkConf")
            java_import(gateway.jvm, "org.apache.spark.api.java.*")
            java_import(gateway.jvm, "org.apache.spark.api.python.*")
            java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
            java_import(gateway.jvm, "org.apache.spark.sql.*")
            java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
            java_import(gateway.jvm, "scala.Tuple2")

            jsc = gateway.entry_point.sc()
            jconf = gateway.entry_point.sc().getConf()
            jsqlc = gateway.entry_point.hivectx() if gateway.entry_point.hivectx() is not None \
                else gateway.entry_point.sqlctx()

            conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
            sc = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
            global_dict['sc'] = sc

            if spark_major_version >= "2":
                from pyspark.sql import SparkSession
                spark_session = SparkSession(
                    sc, gateway.entry_point.sparkSession())
                sqlc = SQLContext(sc, spark_session, jsqlc)
                global_dict['sqlContext'] = sqlc
                global_dict['spark'] = spark_session
            else:
                sqlc = SQLContext(sc, jsqlc)
                global_dict['sqlContext'] = sqlc

                # LIVY-294, need to check whether HiveContext can work properly,
                # fallback to SQLContext if HiveContext can not be initialized successfully.
                # Only for spark-1.
                code = textwrap.dedent("""
                    import py4j
                    from pyspark.sql import SQLContext
                    try:
                      sqlContext.tables()
                    except py4j.protocol.Py4JError:
                      sqlContext = SQLContext(sc)""")
                exec(code, global_dict)

            # Start the py4j callback server
            from py4j.protocol import ENTRY_POINT_OBJECT_ID
            from py4j.java_gateway import CallbackServerParameters

            try:
                gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
                gateway.start_callback_server(
                    callback_server_parameters=CallbackServerParameters(
                        port=0, auth_token=gateway_secret))
            except (KeyError, TypeError):
                # no gateway secret configured, or older py4j without auth_token support
                gateway.start_callback_server(
                    callback_server_parameters=CallbackServerParameters(
                        port=0))

            socket_info = gateway._callback_server.server_socket.getsockname()
            listening_port = socket_info[1]
            pyspark_job_processor = PySparkJobProcessorImpl()
            gateway.gateway_property.pool.dict[
                ENTRY_POINT_OBJECT_ID] = pyspark_job_processor

            global local_tmp_dir_path, job_context
            local_tmp_dir_path = tempfile.mkdtemp()
            job_context = JobContextImpl()

        print(sys.stdout.getvalue(), file=sys_stderr)
        print(sys.stderr.getvalue(), file=sys_stderr)

        clearOutputs()

        print('READY(port=' + str(listening_port) + ')', file=sys_stdout)
        sys_stdout.flush()

        while True:
            line = sys_stdin.readline()

            if line == '':
                break
            elif line == '\n':
                continue

            try:
                msg = json.loads(line)
            except ValueError:
                LOG.error('failed to parse message', exc_info=True)
                continue

            try:
                msg_type = msg['msg_type']
            except KeyError:
                LOG.error('missing message type', exc_info=True)
                continue

            try:
                content = msg['content']
            except KeyError:
                LOG.error('missing content', exc_info=True)
                continue

            if not isinstance(content, dict):
                LOG.error('content is not a dictionary')
                continue

            try:
                handler = msg_type_router[msg_type]
            except KeyError:
                LOG.error('unknown message type: %s', msg_type)
                continue

            response = handler(content)

            try:
                response = json.dumps(response)
            except ValueError:
                response = json.dumps({
                    'msg_type': 'inspect_reply',
                    'content': {
                        'status': 'error',
                        'ename': 'ValueError',
                        'evalue': 'cannot json-ify %s' % response,
                        'traceback': [],
                    }
                })

            print(response, file=sys_stdout)
            sys_stdout.flush()
    finally:
        if os.environ.get("LIVY_TEST") != "true" and 'sc' in global_dict:
            gateway.shutdown_callback_server()
            shutil.rmtree(local_tmp_dir_path)
            global_dict['sc'].stop()

        sys.stdin = sys_stdin
        sys.stdout = sys_stdout
        sys.stderr = sys_stderr
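
As a hedged sketch (not from the original source), this is roughly how a parent process could exercise the newline-delimited JSON protocol that main() above implements: one JSON object per line on stdin, one JSON reply per line on stdout. The script name and the "execute_request" message type are assumptions; the real types are whatever msg_type_router registers.

import json
import subprocess

proc = subprocess.Popen(
    ["python", "fake_session.py"],          # placeholder for the script containing main()
    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
    universal_newlines=True, bufsize=1)

ready = proc.stdout.readline()              # e.g. "READY(port=12345)"

request = {"msg_type": "execute_request", "content": {"code": "1 + 1"}}
proc.stdin.write(json.dumps(request) + "\n")
proc.stdin.flush()

reply = json.loads(proc.stdout.readline())  # handler's JSON response, printed by main()
print(reply)

proc.stdin.close()                          # EOF ends the read loop in main()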
Example #29
0
            matplotlib.use('Agg')
            warnings.warn("Unable to load inline matplotlib backend, "
                          "falling back to Agg")


def handler_stop_signals(sig, frame):
    sys.exit("Got signal : " + str(sig))


signal.signal(signal.SIGINT, handler_stop_signals)

host = "127.0.0.1"
if len(sys.argv) >= 3:
    host = sys.argv[2]

client = GatewayClient(address=host, port=int(sys.argv[1]))

#gateway = JavaGateway(client, auto_convert = True)
gateway = JavaGateway(client)

intp = gateway.entry_point
intp.onPythonScriptInitialized(os.getpid())

z = PyZeppelinContext()
z._setup_matplotlib()

output = Logger()
sys.stdout = output
#sys.stderr = output

while True:
Example #30
0
File: java_gateway.py Project: hufh/mrgeo
def launch_gateway(host=None, port=None):
    global _isremote
    global _forked_proc
    requesthost = socket.gethostname()
    requestport = 0

    # Launch the Py4j gateway using the MrGeo command so that we pick up the proper classpath

    fork = True

    if host is not None and port is not None:
        requesthost = host
        requestport = port
        fork = False
    else:
        if "MRGEO_HOST" in os.environ:
            requesthost = os.environ["MRGEO_HOST"]
            fork = False

        if "MRGEO_PORT" in os.environ:
            requestport = int(os.environ["MRGEO_PORT"])
            fork = False

    if port is not None and requestport == 0:
        requestport = port

    # If we didn't get a request port, get one. We bind a socket to port 0 so the OS
    # assigns an unused port instead of us guessing.
    if requestport == 0:
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_socket.settimeout(0.01)
        tmp_socket.bind((requesthost, 0))
        # tmp_socket.listen(1)

        name, requestport = tmp_socket.getsockname()
        tmp_socket.close()

    if fork:
        # Start a socket that will be used by PythonGatewayServer to communicate its port to us

        script = find_script()

        # command = [script, "python", "-v", "-p", str(requestport)]
        command = [script, "python", "-p", str(requestport)]

        environ = os.environ
        # Add some more memory
        environ['HADOOP_CLIENT_OPTS'] = '-Xmx12G ' + environ.get(
            'HADOOP_CLIENT_OPTS', '')

        # Allow remote debugging
        # environ['HADOOP_CLIENT_OPTS'] = '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 ' + environ.get('HADOOP_CLIENT_OPTS', '')

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        # Don't send ctrl-c / SIGINT to the Java gateway:
        def preexec_func():
            os.setsid()
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        _forked_proc = Popen(command,
                             stdin=PIPE,
                             preexec_fn=preexec_func,
                             env=environ,
                             bufsize=1,
                             universal_newlines=True)

        # while True:
        #     out = _forked_proc.stdout.read(1)
        #
        #     # print("[" + out + "] " + str(_forked_proc.poll()))
        #     if out != '':
        #         break
        #
        #     if _forked_proc.poll() is not None:
        #         raise Exception("Java gateway process exited before sending the driver its port number: returned: " +
        #                         str(_forked_proc.poll()))

        # time.sleep(5)
        # We use select() here in order to avoid blocking indefinitely if the subprocess dies
        # before connecting
        # while proc.poll() is None:
        #     pass

        # _forked_proc.stdout = subprocess.STDOUT

        atexit.register(terminate)

    timeout = 30  # (seconds)
    request_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    request_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    start = time.time()
    connected = -1
    while (time.time() - start) < timeout and connected != 0:
        connected = request_socket.connect_ex((requesthost, requestport))
        time.sleep(0.5)

    if connected != 0:
        raise Exception("Could not connect to the java gateway process")

    readable, writable, error = select.select([request_socket], [], [],
                                              timeout)

    # read the communication port from the server
    if request_socket in readable:

        data = ""
        while len(data) < 8:
            # keep it to 4 bytes (an int)
            data += request_socket.recv(8)

        java_python_port, python_java_port = struct.unpack("!ii", data)
        request_socket.close()
    else:
        raise Exception("Port is not readable")

    _isremote = not fork

    if java_python_port is None:
        raise Exception(
            "Java gateway process exited before sending the driver its port number"
        )

    print("Talking with MrGeo on port " + str(java_python_port))

    # Connect to the gateway
    gateway_client = GatewayClient(address=requesthost, port=java_python_port)
    gateway = JavaGateway(gateway_client=gateway_client,
                          auto_convert=True,
                          python_proxy_port=python_java_port)

    # Import the classes used by MrGeo
    java_import(gateway.jvm, "org.mrgeo.python.*")

    # Import classes used by Spark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")

    return gateway, gateway_client
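
The 8-byte handshake above (two big-endian ints read from request_socket and unpacked with struct.unpack("!ii", ...)) can be illustrated in isolation. The sketch below stands in for the real MrGeo gateway process and only mirrors the wire format; it is not the project's implementation.

import socket
import struct
import threading

# Set up the listening socket first so the client cannot connect too early.
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.bind(("127.0.0.1", 0))
srv.listen(1)
_, listen_port = srv.getsockname()

def reply_with_ports(java_python_port=25333, python_java_port=25334):
    # Stand-in for the gateway process: send the two ports as two big-endian ints.
    conn, _ = srv.accept()
    conn.sendall(struct.pack("!ii", java_python_port, python_java_port))
    conn.close()

threading.Thread(target=reply_with_ports, daemon=True).start()

# Client side, mirroring the read loop in launch_gateway() above.
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.connect(("127.0.0.1", listen_port))
data = b""
while len(data) < 8:
    data += client.recv(8 - len(data))
jp_port, pj_port = struct.unpack("!ii", data)
print(jp_port, pj_port)   # -> 25333 25334
client.close()
srv.close()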