def __init__(self, language, gateway, **kwargs):
    '''
    Constructor
    @param language: The language code for the proper initialization of this
        language-dependent tool
    @type language: string
    @param gateway: An already initialized Py4J Java gateway
    @type gateway: py4j.java_gateway.JavaGateway
    '''
    self.language = language
    # self.jvm = JVM(java_classpath)
    # socket_no = self.jvm.socket_no
    # gatewayclient = GatewayClient('localhost', socket_no)
    # gateway = JavaGateway(gatewayclient, auto_convert=True, auto_field=True)
    # sys.stderr.write("Initialized local Java gateway with pid {} in socket {}\n".format(self.jvm.pid, socket_no))
    self.meteor_view = gateway.new_jvm_view()
    # import the necessary Java packages from the METEOR jar
    java_import(self.meteor_view, 'edu.cmu.meteor.scorer.*')
    java_import(self.meteor_view, 'edu.cmu.meteor.util.*')
    # pass the language setting into the METEOR configuration object
    config = self.meteor_view.MeteorConfiguration()
    config.setLanguage(language)
    # initialize the scorer with the given config
    sys.stderr.write("If the next line displays an error, it is not critical, "
                     "but the METEOR language-specific transducer must be installed.\n")
    self.scorer = self.meteor_view.MeteorScorer(config)
def start_gateway_server():
    classPath = compute_classpath(DDF_HOME)
    # launch GatewayServer in a new process
    javaopts = os.getenv('JAVA_OPTS')
    if javaopts is not None:
        javaopts = javaopts.split()
    else:
        javaopts = []
    # command = ["java", "-classpath", classPath] + \
    #           ["-Dlog4j.configuration=file:" + DDF_HOME + "/core/conf/local/ddf-local-log4j.properties"] + \
    #           ["py4j.GatewayServer", "--die-on-broken-pipe", "0"]
    command = ["java", "-classpath", classPath] + javaopts + \
              ["py4j.GatewayServer", "--die-on-broken-pipe", "0"]
    proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func)
    # get the port of the GatewayServer
    port = int(proc.stdout.readline())

    class JavaOutputThread(Thread):
        def __init__(self, stream):
            Thread.__init__(self)
            self.daemon = True
            self.stream = stream

        def run(self):
            while True:
                line = self.stream.readline()
                sys.stderr.write(line)

    JavaOutputThread(proc.stdout).start()
    # connect to the gateway server
    gateway = JavaGateway(GatewayClient(port=port), auto_convert=False)
    java_import(gateway.jvm, "io.ddf.*")
    java_import(gateway.jvm, "io.ddf.spark.*")
    return gateway
def __init__(self, mrgeo):
    self._mrgeo = mrgeo
    jvm = self._mrgeo._get_jvm()
    # Import the raster map op test support class and all other needed classes
    java_import(jvm, "org.mrgeo.mapalgebra.InlineCsvMapOp")
    self._jvm = jvm
    self._sparkContext = mrgeo.sparkContext
def _do_init(self, *args, **kwargs):
    # Modifies the base _do_init to add a Java-Cassandra SparkContext (jcsc)
    # to the instance
    super(CassandraSparkContext, self)._do_init(*args, **kwargs)
    java_import(self._jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    java_import(self._jvm, "com.datastax.spark.connector.RowConvertingIterator")
    self._jcsc = self._jvm.CassandraJavaUtil.javaFunctions(self._jsc)
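# A minimal usage sketch for the subclass above, assuming it is exposed as
# CassandraSparkContext by a pyspark-cassandra-style package; the import path,
# connection host, and app name are illustrative, not confirmed by the source.
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext  # assumed package layout

conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(appName="cassandra-example", conf=conf)
# _do_init above has now attached the Java-side CassandraJavaUtil wrapper:
print(sc._jcsc)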
def singlethread(java_classpath):
    print "Thread starting"
    jvm = JVM(java_classpath, dir_path)
    socket_no = jvm.socket_no
    gatewayclient = GatewayClient('localhost', socket_no)
    print "Gclient started"
    gateway = JavaGateway(gatewayclient, auto_convert=True, auto_field=True)
    sys.stderr.write("Initialized global Java gateway with pid {} on socket {}\n".format(jvm.pid, socket_no))
    print "Java Gateway started"
    # create a new view for the jvm
    meteor_view = gateway.new_jvm_view()
    # import the required packages
    java_import(meteor_view, 'edu.cmu.meteor.scorer.*')
    java_import(meteor_view, 'edu.cmu.meteor.util.*')
    print "Modules imported"
    # pass the language setting into the METEOR configuration object
    config = meteor_view.MeteorConfiguration()
    config.setLanguage("en")
    # initialize the Java scorer object
    scorer = meteor_view.MeteorScorer(config)
    print "Object initialized"
    # run an object function
    stats = scorer.getMeteorStats("Test sentence", "Test sentence !")
    print stats.score
    return 1
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)
    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options are listed at
    # https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")
    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
def scala_set_to_set(ctx, x):
    from py4j.java_gateway import java_import
    # import scala
    java_import(ctx._jvm, 'scala')
    # convert the Scala set to a Java set, then iterate it into a Python set
    return set(ctx._jvm.scala.collection.JavaConversions.setAsJavaSet(x))
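# Hedged usage sketch for the converter above: `sc` is assumed to be a live
# SparkContext, and `scala_set` stands in for any scala.collection.Set handle
# returned by a JVM-side call; both names are illustrative.
py_set = scala_set_to_set(sc, scala_set)
assert isinstance(py_set, set)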
def get_smoothing_method(self, spark_context):
    java_import(spark_context._jvm, ClassNames.WEIGHTS)
    java_import(spark_context._jvm, ClassNames.WEIGHTED_MOVING_AVERAGE)
    java_weights = spark_context._jvm.Weights(self.__python_weights.limit())
    for index in range(self.__window_size):
        java_weights.add(self.__python_weights.get(index))
    return spark_context._jvm.WeightedMovingAverageMethod(self.__window_size, java_weights)
def _connect(self, gateway, grammarfile):
    module_view = gateway.new_jvm_view()
    java_import(module_view, 'BParser')
    # get the application instance
    log.info("Grammar file: {}".format(grammarfile))
    self.bp_obj = module_view.BParser(grammarfile)
    sys.stderr.write("got BParser object\n")
def launch_gateway(): if "MRGEO_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["MRGEO_GATEWAY_PORT"]) else: # Launch the Py4j gateway using the MrGeo command so that we pick up the proper classpath script = find_script() # Start a socket that will be used by PythonGatewayServer to communicate its port to us callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) callback_socket.bind(('127.0.0.1', 0)) callback_socket.listen(1) callback_host, callback_port = callback_socket.getsockname() env = dict(os.environ) env['_MRGEO_DRIVER_CALLBACK_HOST'] = callback_host env['_MRGEO_DRIVER_CALLBACK_PORT'] = str(callback_port) command = [script, "python", "-v", "-h", callback_host, "-p", str(callback_port)] # Launch the Java gateway. # We open a pipe to stdin so that the Java gateway can die when the pipe is broken # Don't send ctrl-c / SIGINT to the Java gateway: def preexec_func(): signal.signal(signal.SIGINT, signal.SIG_IGN) proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) gateway_port = None # We use select() here in order to avoid blocking indefinitely if the subprocess dies # before connecting while gateway_port is None and proc.poll() is None: timeout = 1 # (seconds) readable, _, _ = select.select([callback_socket], [], [], timeout) if callback_socket in readable: gateway_connection = callback_socket.accept()[0] # Determine which ephemeral port the server started on: gateway_port = read_int(gateway_connection.makefile(mode="rb")) gateway_connection.close() callback_socket.close() if gateway_port is None: raise Exception("Java gateway process exited before sending the driver its port number") print("Talking with MrGeo on port " + str(gateway_port)) # Connect to the gateway gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True) # Import the classes used by MrGeo java_import(gateway.jvm, "org.mrgeo.python.*") # Import classes used by Spark java_import(gateway.jvm, "org.apache.spark.SparkConf") java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") return gateway
def _getSome(self, value):
    java_import(self._jvm, "scala.Some")
    return self._jvm.Some(value)

# suite = TestLoader().loadTestsFromTestCase(MrGeoLocalIntegrationTests)
# TextTestRunner(verbosity=2).run(suite)
def setUp(self):
    conf = SparkConf().setAppName('test').setMaster('local[*]')
    pwd = os.path.dirname(os.path.realpath(__file__))
    metastore_dir = os.path.abspath(os.path.join(pwd, '..', 'metastore_db'))
    silentremove(os.path.join(metastore_dir, "dbex.lck"))
    silentremove(os.path.join(metastore_dir, "db.lck"))
    self.sc = SparkContext(conf=conf)
    self.jvm = self.sc._gateway.jvm
    java_import(self.jvm, "org.apache.spark.sql.*")
def createColor(r, g, b):
    global _gateway
    if _gateway is None:
        _gateway = JavaGateway()
    jvm = _gateway.jvm
    java_import(jvm, 'org.eclipse.swt.graphics.*')
    return jvm.Color(None, r, g, b)
def createHistogramBound(position, r, g, b):
    global _gateway
    if _gateway is None:
        _gateway = JavaGateway()
    jvm = _gateway.jvm
    java_import(jvm, 'org.eclipse.dawnsci.plotting.api.histogram.*')
    return jvm.HistogramBound(position, r, g, b)
def getService(serviceClass):
    global _gateway
    if _gateway is None:
        _gateway = JavaGateway()
    jvm = _gateway.jvm
    java_import(jvm, 'org.dawb.common.services.*')
    return jvm.Activator.getService(serviceClass)
def __init__(self, _jvm, smvconfig):
    self._jvm = _jvm
    self.smvconfig = smvconfig
    self.dsRepoFactories = []

    from py4j.java_gateway import java_import
    java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
    java_import(self._jvm, "org.tresamigos.smv.DataSetRepoFactoryPython")

    self.helper = self._jvm.SmvPythonHelper
def getPlottingSystem(plottingSystemName):
    global _gateway
    if _gateway is None:
        _gateway = JavaGateway()
    jvm = _gateway.jvm
    java_import(jvm, 'org.eclipse.dawnsci.plotting.api.*')
    return jvm.PlottingFactory.getPlottingSystem(plottingSystemName, True)
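# The helpers above all share one lazily created module-level gateway. A
# hedged sketch of combining them, assuming a Dawn/Eclipse workbench with a
# py4j gateway already listening on the default port; the plotting system
# name "Plot 1" and the bound values are illustrative.
ps = getPlottingSystem("Plot 1")          # look up a named plotting system
red = createColor(255, 0, 0)              # SWT Color created on the Java side
upper = createHistogramBound(0.9, 255, 0, 0)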
def new_gateway_client():
    global __gateway_server_port
    if __gateway_server_port is None:
        __gateway_server_port = start_gateway_server()
    # connect to the gateway server
    gateway = JavaGateway(GatewayClient(port=__gateway_server_port), auto_convert=False)
    java_import(gateway.jvm, 'io.ddf.*')
    java_import(gateway.jvm, 'io.ddf.spark.*')
    return gateway
def _create_job(self):
    if not self._job:
        jvm = self._get_jvm()
        java_import(jvm, "org.mrgeo.job.*")
        appname = "PyMrGeo"
        self._job = jvm.JobArguments()
        java_gateway.set_field(self._job, "name", appname)
        # Yarn is the default
        self.useyarn()
def __set_file_type(self, jvm, file_type):
    java_import(jvm, ClassNames.FileType)
    file_types = {
        'CSV': jvm.FileType.CSV,
        'TSV': jvm.FileType.TSV
    }
    if file_type in file_types.values():
        self.__file_type = file_type
    elif file_type.upper() in file_types:
        self.__file_type = file_types[file_type.upper()]
    else:
        raise ValueError('"%s" is not a valid file type\n'
                         'Valid file types are CSV and TSV' % file_type)
def __init__(self, language, gateway, **kwargs):
    '''
    Constructor
    '''
    self.language = language
    ltool_view = gateway.new_jvm_view()
    java_import(ltool_view, 'org.languagetool.Languages')
    java_import(ltool_view, 'org.languagetool.JLanguageTool')
    if language == 'ru':
        language = 'ru-RU'
    tool_language = ltool_view.Languages.getLanguageForShortName(language)
    self.ltool = ltool_view.JLanguageTool(tool_language)
def run(self):
    input = self.setupInputStreams(self.options.num_streams)
    output = self.setupOutputStream(input)
    output.count().pprint()
    sc = self.ssc.sparkContext
    java_import(sc._jvm, "org.apache.spark.streaming.scheduler.StatsReportListener")
    numBatches = int(self.options.total_duration / self.options.batch_duration)
    listener = sc._jvm.StatsReportListener(numBatches)
    self.ssc._jssc.addStreamingListener(listener)
    self.ssc.start()
    startTime = time.time()
    time.sleep(self.options.total_duration)
    self.ssc.stop(False, True)
    return self.processResults(listener)
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start the callback server
    # getattr will fall back to the JVM, so we cannot test with hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in the JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of the CallbackClient with the real port
        jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

    # register a serializer for TransformFunction;
    # this happens before creating the SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def test_dir_jvmview_two():
    with example_app_process():
        with gateway() as g:
            view1 = g.new_jvm_view()
            view2 = g.new_jvm_view()
            helper_dir_jvmview(view1)
            helper_dir_jvmview(view2)

            # now give them different contents
            java_import(view1, "com.fourth.Class4")
            java_import(view2, "com.fifth.Class5")

            assert sorted(dir(view1)) == [UserHelpAutoCompletion.KEY,
                                          "Class1", "Class2", "Class3", "Class4"]
            assert sorted(dir(view2)) == [UserHelpAutoCompletion.KEY,
                                          "Class1", "Class2", "Class3", "Class5"]
def copy(cls, srcfile, srcpath=None, dstpath=None, dstfile=None):
    jvm = cls.mrgeo._get_jvm()
    java_import(jvm, "org.mrgeo.hdfs.utils.HadoopFileUtils")
    java_import(jvm, "org.apache.hadoop.fs.Path")

    if srcpath is not None:
        src = srcpath
        if not src.endswith('/'):
            src += '/'
        src += srcfile
    else:
        src = srcfile

    if not os.path.exists(src):
        if os.path.exists(cls.inputdir + src):
            src = cls.inputdir + src
    if not os.path.exists(src):
        raise Exception("Source (" + src + ") is not a file or directory")

    if dstfile is not None:
        # mirror the src branch: destination directory plus destination file
        dst = dstpath
        if not dst.endswith('/'):
            dst += '/'
        dst += dstfile
        if not os.path.isfile(src):
            raise Exception("Source (" + src + ") must be a file")
        if jvm.HadoopFileUtils.exists(dst):
            jvm.HadoopFileUtils.delete(dst)
        jvm.HadoopFileUtils.copyFileToHdfs(src, dst)
        return dst
    elif dstpath is not None:
        dst = dstpath
    else:
        dst = cls.inputhdfs

    basefile = os.path.basename(src)
    dstfile = dst + basefile
    if jvm.HadoopFileUtils.exists(dstfile):
        jvm.HadoopFileUtils.delete(dstfile)
    jvm.HadoopFileUtils.copyToHdfs(src, dst)
    return dstfile
def create_gateway():
    """
    Initialize a gateway with default port and address

    :return: JavaGateway, or None if the connection failed
    """
    try:
        gateway = JavaGateway(GatewayClient(), auto_convert=True)
    except Py4JNetworkError as err:
        LOG.error("Failed to connect. Please make sure Spaceship gateway is running: %r", err)
        return None
    # import spaceship code
    java_import(gateway.jvm, "edu.gatech.sunlab.spaceship.api.py.*")
    return gateway
def comparevector(self, vector, testname):
    if self.GENERATE_BASELINE_DATA:
        self.savevector(vector, str(testname))
    else:
        jvm = self.mrgeo._get_jvm()
        # test = raster.mapop.toDataset(False)
        java_import(jvm, "org.mrgeo.hdfs.vector.DelimitedVectorReader")

        testvector = str(self.outputhdfs + testname + ".tsv")
        vector.ssave(testvector)
        expectedvector = str(self.inputdir + testname + ".tsv")

        vdp_expected = jvm.DataProviderFactory.getVectorDataProvider(
            expectedvector, jvm.DataProviderFactory.AccessMode.READ,
            jvm.HadoopUtils.createConfiguration())
        expected_geom_reader = vdp_expected.getVectorReader().get()

        vdp = jvm.DataProviderFactory.getVectorDataProvider(
            testvector, jvm.DataProviderFactory.AccessMode.READ,
            jvm.HadoopUtils.createConfiguration())
        self.assertTrue(vdp is not None)
        vector_reader = vdp.getVectorReader()
        self.assertTrue(vector_reader is not None)
        self.assertTrue(is_instance_of(self.mrgeo.gateway, vector_reader,
                                       jvm.DelimitedVectorReader))
        self.assertEquals(vdp_expected.getVectorReader().count(), vector_reader.count())

        geom_reader = vector_reader.get()
        self.assertTrue(geom_reader is not None)

        while expected_geom_reader.hasNext():
            expected_geom = expected_geom_reader.next()
            geom = geom_reader.next()
            self.assertTrue(geom is not None)
            self.assertEquals(expected_geom.type(), geom.type())
            # compare each numeric attribute within a small tolerance
            for attr in ("COST_S", "DISTANCE_M", "MINSPEED_MPS",
                         "MAXSPEED_MPS", "AVGSPEED_MPS"):
                self.assertAlmostEquals(float(expected_geom.getAttribute(attr)),
                                        float(geom.getAttribute(attr)),
                                        delta=0.001)

        # Should not be any more geometries in the actual output
        self.assertFalse(geom_reader.hasNext())
        jvm.HadoopFileUtils.delete(testvector)
def java_gateway():
    from py4j.java_gateway import java_import, JavaGateway
    gateway = JavaGateway(auto_convert=True)
    jvm = gateway.jvm
    java_import(jvm, 'com.netflix.hystrix.util.HystrixRollingNumber')
    java_import(jvm, 'com.netflix.hystrix.util.HystrixRollingNumberEvent')
    java_import(jvm, 'com.netflix.hystrix.HystrixCommandProperties')
    java_import(jvm, 'com.netflix.hystrix.HystrixCommandProperties.Setter')
    return gateway
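# A hedged sketch of exercising the imports above: reading an enum constant
# is a cheap way to confirm the Hystrix classes are actually reachable through
# the gateway. A GatewayServer exposing the Hystrix jars is assumed to be
# running on the default port.
gw = java_gateway()
# Static and enum members resolve directly on the JVM view after java_import.
success = gw.jvm.HystrixRollingNumberEvent.SUCCESS
print(success.toString())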
def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={},
                 storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                 keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    Create an input stream that pulls messages from a Kafka Broker.

    :param ssc: StreamingContext object
    :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..).
    :param groupId: The group id for this consumer.
    :param topics: Dict of (topic_name -> numPartitions) to consume.
                   Each partition is consumed in its own thread.
    :param kafkaParams: Additional params for Kafka
    :param storageLevel: RDD storage level.
    :param keyDecoder: A function used to decode key (default is utf8_decoder)
    :param valueDecoder: A function used to decode value (default is utf8_decoder)
    :return: A DStream object
    """
    java_import(ssc._jvm, "org.apache.spark.streaming.kafka.KafkaUtils")

    kafkaParams.update({
        "zookeeper.connect": zkQuorum,
        "group.id": groupId,
        "zookeeper.connection.timeout.ms": "10000",
    })
    if not isinstance(topics, dict):
        raise TypeError("topics should be dict")
    jtopics = MapConverter().convert(topics, ssc.sparkContext._gateway._gateway_client)
    jparam = MapConverter().convert(kafkaParams, ssc.sparkContext._gateway._gateway_client)
    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

    def getClassByName(name):
        return ssc._jvm.org.apache.spark.util.Utils.classForName(name)

    try:
        array = getClassByName("[B")
        decoder = getClassByName("kafka.serializer.DefaultDecoder")
        jstream = ssc._jvm.KafkaUtils.createStream(ssc._jssc, array, array, decoder, decoder,
                                                   jparam, jtopics, jlevel)
    except Py4JError, e:
        # TODO: use --jar once it also works on the driver
        if not e.message or 'call a package' in e.message:
            print "No kafka package, please put the assembly jar into classpath:"
            print " $ bin/spark-submit --driver-class-path external/kafka-assembly/target/" + \
                  "scala-*/spark-streaming-kafka-assembly-*.jar"
        raise e
    # wrap the Java DStream and decode each (key, value) pair
    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    stream = DStream(jstream, ssc, ser)
    return stream.map(lambda (k, v): (keyDecoder(k), valueDecoder(v)))
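# A minimal driver sketch for the helper above, assuming a local Zookeeper at
# localhost:2181 and an illustrative topic named "events"; as the error
# message in the helper explains, the Kafka assembly jar must be on the
# driver classpath for the Java-side call to resolve.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="kafka-stream-example")
ssc = StreamingContext(sc, 2)  # 2-second batches
stream = createStream(ssc, "localhost:2181", "example-group", {"events": 1})
stream.pprint()
ssc.start()
ssc.awaitTermination()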
def start_gateway_server():
    classpath = compute_classpath(DDF_HOME)

    java_opts = os.getenv('JAVA_OPTS')
    if java_opts is not None:
        java_opts = java_opts.split()
    else:
        java_opts = []
    # set log options and memory configuration
    if not any([s.startswith('-Dlog4j.configuration') for s in java_opts]):
        java_opts += ['-Dlog4j.configuration=file:{}/core/conf/local/ddf-local-log4j.properties'.format(DDF_HOME)]
    if not any([s.startswith('-Xms') for s in java_opts]):
        java_opts += ['-Xms128m']
    if not any([s.startswith('-Xmx') for s in java_opts]):
        java_opts += ['-Xmx512m']
    if not any([s.startswith('-XX:MaxPermSize') for s in java_opts]):
        java_opts += ['-XX:MaxPermSize=512m']

    command = ['java', '-classpath', classpath] + java_opts + \
              ['py4j.GatewayServer', '--die-on-broken-pipe', '0']
    # launch GatewayServer in a new process
    process = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=pre_exec_func)
    # get the port of the GatewayServer
    port = int(process.stdout.readline())

    class JavaOutputThread(Thread):
        def __init__(self, stream):
            Thread.__init__(self)
            self.daemon = True
            self.stream = stream

        def run(self):
            while True:
                line = self.stream.readline()
                sys.stderr.write(line)

    JavaOutputThread(process.stdout).start()
    # connect to the gateway server
    gateway = JavaGateway(GatewayClient(port=port), auto_convert=False)
    java_import(gateway.jvm, 'io.ddf.*')
    java_import(gateway.jvm, 'io.ddf.spark.*')
    return gateway
def import_flink_view(gateway):
    """
    Import the classes used by PyFlink.

    :param gateway: gateway connected to JavaGatewayServer
    """
    # Import the classes used by PyFlink
    java_import(gateway.jvm, "org.apache.flink.table.api.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.java.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.dataview.*")
    java_import(gateway.jvm, "org.apache.flink.table.descriptors.*")
    java_import(gateway.jvm, "org.apache.flink.table.sources.*")
    java_import(gateway.jvm, "org.apache.flink.table.sinks.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.logical.*")
    java_import(gateway.jvm, "org.apache.flink.table.util.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.TypeInformation")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.Types")
    java_import(gateway.jvm, "org.apache.flink.api.java.ExecutionEnvironment")
    java_import(gateway.jvm, "org.apache.flink.streaming.api.environment.StreamExecutionEnvironment")
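# A hedged sketch of what the imports above buy you: once import_flink_view
# has run, the imported classes resolve as top-level names on gateway.jvm.
# The `gateway` variable is assumed to come from this module's companion
# launch helper; it just needs to be an already-connected JavaGateway.
import_flink_view(gateway)
env = gateway.jvm.ExecutionEnvironment.getExecutionEnvironment()
print(env.getParallelism())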
def launch_gateway(): if "PYSPARK_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) else: SPARK_HOME = os.environ["SPARK_HOME"] # Launch the Py4j gateway using Spark's run command so that we pick up the # proper classpath and settings from spark-env.sh on_windows = platform.system() == "Windows" script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit" submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell") if os.environ.get("SPARK_TESTING"): submit_args = "--conf spark.ui.enabled=false " + submit_args command = [os.path.join(SPARK_HOME, script)] + shlex.split(submit_args) # Start a socket that will be used by PythonGatewayServer to communicate its port to us callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) callback_socket.bind(('127.0.0.1', 0)) callback_socket.listen(1) callback_host, callback_port = callback_socket.getsockname() env = dict(os.environ) env['_PYSPARK_DRIVER_CALLBACK_HOST'] = callback_host env['_PYSPARK_DRIVER_CALLBACK_PORT'] = str(callback_port) # Launch the Java gateway. # We open a pipe to stdin so that the Java gateway can die when the pipe is broken if not on_windows: # Don't send ctrl-c / SIGINT to the Java gateway: def preexec_func(): signal.signal(signal.SIGINT, signal.SIG_IGN) proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) else: # preexec_fn not supported on Windows proc = Popen(command, stdin=PIPE, env=env) gateway_port = None # We use select() here in order to avoid blocking indefinitely if the subprocess dies # before connecting while gateway_port is None and proc.poll() is None: timeout = 1 # (seconds) readable, _, _ = select.select([callback_socket], [], [], timeout) if callback_socket in readable: gateway_connection = callback_socket.accept()[0] # Determine which ephemeral port the server started on: gateway_port = read_int(gateway_connection.makefile(mode="rb")) gateway_connection.close() callback_socket.close() if gateway_port is None: raise Exception( "Java gateway process exited before sending the driver its port number" ) # In Windows, ensure the Java child processes do not linger after Python has exited. # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when # the parent process' stdin sends an EOF). In Windows, however, this is not possible # because java.lang.Process reads directly from the parent process' stdin, contending # with any opportunity to read an EOF from the parent. Note that this is only best # effort and will not take effect if the python process is violently terminated. if on_windows: # In Windows, the child process here is "spark-submit.cmd", not the JVM itself # (because the UNIX "exec" command is not available). This means we cannot simply # call proc.kill(), which kills only the "spark-submit.cmd" process but not the # JVMs. 
Instead, we use "taskkill" with the tree-kill option "/t" to terminate all # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx) def killChild(): Popen([ "cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid) ]) atexit.register(killChild) # Connect to the gateway gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=True) # Import the classes used by PySpark java_import(gateway.jvm, "org.apache.spark.SparkConf") java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") # TODO(davies): move into sql java_import(gateway.jvm, "org.apache.spark.sql.*") java_import(gateway.jvm, "org.apache.spark.sql.hive.*") java_import(gateway.jvm, "scala.Tuple2") return gateway
from pyspark.serializers import MarshalSerializer, PickleSerializer
from time import sleep

# for back compatibility
from pyspark.sql import SQLContext, HiveContext, SchemaRDD, Row

client = GatewayClient(port=int(sys.argv[1]))
sparkVersion = sys.argv[2]

if sparkVersion.startswith("1.4"):
    gateway = JavaGateway(client, auto_convert=True)
else:
    gateway = JavaGateway(client)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")

bridge = gateway.entry_point
state = bridge.state()
state.markReady()

# jsc = bridge.javaSparkContext()

if sparkVersion.startswith("1.2"):
    java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
# -*- coding: utf-8 -*-
# nlg.py

import random
from datetime import datetime

from py4j_server import launch_py4j_server
from py4j.java_gateway import java_import

gateway = launch_py4j_server()

# Import the SimpleNLG classes
java_import(gateway.jvm, "simplenlg.features.*")
java_import(gateway.jvm, "simplenlg.realiser.*")

# Define aliases so that we don't have to use the gateway.jvm prefix.
NPPhraseSpec = gateway.jvm.NPPhraseSpec
PPPhraseSpec = gateway.jvm.PPPhraseSpec
SPhraseSpec = gateway.jvm.SPhraseSpec
InterrogativeType = gateway.jvm.InterrogativeType
Realiser = gateway.jvm.Realiser
TextSpec = gateway.jvm.TextSpec
Tense = gateway.jvm.Tense
Form = gateway.jvm.Form

date_endings = {
    "0": "0th",
    "1": "1st",
    "2": "2nd",
    "3": "3rd",
    "4": "4th",
    "5": "5th",
    "6": "6th",
    "7": "7th",
    "8": "8th",
    "9": "9th",
}
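# A hedged sketch of the aliases above in action, following the classic
# SimpleNLG-over-py4j tutorial. The method names (setSubject, setVerb,
# addComplement, realiseDocument) assume the SimpleNLG v3-era API and should
# be verified against the jar actually served by the gateway.
phrase = SPhraseSpec()
phrase.setSubject("the monkey")
phrase.setVerb("eat")
phrase.addComplement("the banana")
realiser = Realiser()
print(realiser.realiseDocument(phrase))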
def import_flink_view(gateway):
    """
    Import the classes used by PyFlink.

    :param gateway: gateway connected to JavaGatewayServer
    """
    # Import the classes used by PyFlink
    java_import(gateway.jvm, "org.apache.flink.table.api.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.java.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.bridge.java.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.dataview.*")
    java_import(gateway.jvm, "org.apache.flink.table.catalog.*")
    java_import(gateway.jvm, "org.apache.flink.table.descriptors.*")
    java_import(gateway.jvm, "org.apache.flink.table.descriptors.python.*")
    java_import(gateway.jvm, "org.apache.flink.table.expressions.*")
    java_import(gateway.jvm, "org.apache.flink.table.sources.*")
    java_import(gateway.jvm, "org.apache.flink.table.sinks.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.logical.*")
    java_import(gateway.jvm, "org.apache.flink.table.util.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.TypeInformation")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.Types")
    java_import(gateway.jvm, "org.apache.flink.api.java.ExecutionEnvironment")
    java_import(
        gateway.jvm,
        "org.apache.flink.streaming.api.environment.StreamExecutionEnvironment"
    )
    java_import(
        gateway.jvm, "org.apache.flink.api.common.restartstrategy.RestartStrategies")
    java_import(gateway.jvm, "org.apache.flink.python.util.PythonDependencyUtils")
    java_import(gateway.jvm, "org.apache.flink.python.PythonOptions")
    java_import(gateway.jvm, "org.apache.flink.client.python.PythonGatewayServer")
    java_import(gateway.jvm, "org.apache.flink.streaming.api.functions.python.*")
    java_import(gateway.jvm, "org.apache.flink.streaming.api.operators.python.*")
    java_import(gateway.jvm, "org.apache.flink.streaming.api.typeinfo.python.*")