def run(self):
    gateway = self._initialize_gateway(self.gateway_address)
    if not gateway:
        log_error('Failed to initialize java gateway')
        return

    # noinspection PyProtectedMember
    callback_server_port = gateway._callback_server.server_socket.getsockname()[1]

    spark_context, sql_context = self._initialize_spark_contexts(gateway)
    code_executor = CodeExecutor(spark_context, sql_context, gateway.entry_point)

    try:
        gateway.entry_point.registerCallbackServerPort(callback_server_port)
        gateway.entry_point.registerCodeExecutor(code_executor)
    except Py4JError as e:
        log_error('Exception while registering codeExecutor, or callback server port: {}'.format(e))
        gateway.close()
        return

    # Wait for the end of the world
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        log_debug('Exiting on user\'s request')

    gateway.close()
@staticmethod
def _initialize_gateway(gateway_address):
    (host, port) = gateway_address

    callback_params = CallbackServerParameters(address=host, port=0)
    gateway = JavaGateway(GatewayClient(address=host, port=port),
                          start_callback_server=True,
                          auto_convert=True,
                          callback_server_parameters=callback_params)

    try:
        java_import(gateway.jvm, "org.apache.spark.SparkEnv")
        java_import(gateway.jvm, "org.apache.spark.SparkConf")
        java_import(gateway.jvm, "org.apache.spark.api.java.*")
        java_import(gateway.jvm, "org.apache.spark.api.python.*")
        java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
        java_import(gateway.jvm, "org.apache.spark.sql.*")
        java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
        java_import(gateway.jvm, "scala.Tuple2")
        java_import(gateway.jvm, "scala.collection.immutable.List")
    except Py4JError as e:
        log_error('Error while initializing java gateway: {}'.format(e))
        gateway.close()
        return None

    log_debug('Java Gateway initialized {}'.format(gateway))
    return gateway
def run(self):
    gateway = self._initialize_gateway(self.gateway_address)
    if not gateway:
        log_error('Failed to initialize java gateway')
        return

    # noinspection PyProtectedMember
    callback_server_port = gateway._callback_server.server_socket.getsockname()[1]

    spark_context, spark_session = self._initialize_spark_contexts(gateway)
    code_executor = CodeExecutor(spark_context, spark_session, gateway.entry_point)

    try:
        gateway.entry_point.registerCallbackServerPort(callback_server_port)
        gateway.entry_point.registerCodeExecutor(code_executor)
    except Py4JError as e:
        log_error('Exception while registering codeExecutor, or callback server port: {}'.format(e))
        gateway.close()
        return

    # Wait for the end of the world
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        log_debug('Exiting on user\'s request')

    gateway.close()
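# Both run() variants above call a _initialize_spark_contexts helper that is not shown
# in this section. The sketch below is an assumption of how such a helper could wrap the
# JVM-side SparkContext exposed through the gateway; the entry_point accessor
# getSparkContext() is hypothetical, and the real implementation may differ. It assumes
# `from pyspark import SparkContext, SparkConf` and `from pyspark.sql import SparkSession`.
@staticmethod
def _initialize_spark_contexts(gateway):
    java_spark_context = gateway.entry_point.getSparkContext()  # hypothetical accessor
    java_spark_conf = java_spark_context.getConf()

    # Wrap the already-running JVM SparkContext instead of launching a new one.
    spark_context = SparkContext(
        conf=SparkConf(_jvm=gateway.jvm, _jconf=java_spark_conf),
        gateway=gateway,
        jsc=java_spark_context)

    # Matches the spark_session variant of run(); the sql_context variant would
    # derive an SQLContext from this session instead.
    spark_session = SparkSession(spark_context)
    return spark_context, spark_session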
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gateway-address', action='store')
    args = parser.parse_args()

    gateway_address = args.gateway_address.split(':')
    gateway_address = (gateway_address[0], int(gateway_address[1]))

    log_debug('Initializing PyExecutor at {}'.format(gateway_address))
    py_executor = PyExecutor(gateway_address=gateway_address)
    py_executor.run()
    log_debug('PyExecutor ended!')
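# main() expects the gateway endpoint to be passed as host:port. A typical module entry
# point and invocation are sketched below; the script name pyexecutor.py and the port
# number are illustrative assumptions, not taken from the source.
if __name__ == '__main__':
    main()

# Example invocation:
#   python pyexecutor.py --gateway-address 127.0.0.1:25333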
def isValid(self, custom_operation_code):
    def is_transform_function(field):
        return (isinstance(field, ast.FunctionDef) and
                field.name == self.TRANSFORM_FUNCTION_NAME and
                len(field.args.args) in self.TRANSFORM_FUNCTION_ARITIES)

    try:
        parsed = ast.parse(custom_operation_code)
    except SyntaxError:
        return False

    is_valid = any(filter(is_transform_function, parsed.body))
    log_debug('Valid code? {}: {}'.format(is_valid, custom_operation_code))
    return is_valid
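# For illustration, assuming TRANSFORM_FUNCTION_NAME == 'transform' and
# 1 in TRANSFORM_FUNCTION_ARITIES, a code string like the one below would pass isValid:
# it parses cleanly and its top-level body contains a function definition with the
# expected name and arity.
VALID_CUSTOM_CODE_EXAMPLE = """
def transform(dataframe):
    return dataframe.select(dataframe.columns[0])
"""
# Note that isValid only inspects the AST; it never executes the code, so runtime errors
# inside transform() are not detected at validation time.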
def _run_custom_code(self, workflow_id, node_id, custom_operation_code):
    """
    :param workflow_id: id of the workflow containing the custom code node
    :param node_id: id of node of the DOperation associated with the custom code
    :param custom_operation_code: The code is expected to include a top-level definition
        of a function named according to TRANSFORM_FUNCTION_NAME value
    :return: None
    """
    # This should've been checked before running
    assert self.isValid(custom_operation_code)

    new_spark_session = self.spark_sql_session.newSession()

    spark_version = self.spark_context.version
    if spark_version in ["2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1", "2.2.0"]:
        new_sql_context = SQLContext(self.spark_context, new_spark_session)
    else:
        log_debug("Spark version {} is not supported".format(spark_version))
        raise ValueError("Spark version {} is not supported".format(spark_version))

    raw_input_data_frame = DataFrame(
        jdf=self.entry_point.retrieveInputDataFrame(workflow_id,
                                                    node_id,
                                                    CodeExecutor.INPUT_PORT_NUMBER),
        sql_ctx=new_sql_context)
    input_data_frame = new_spark_session.createDataFrame(raw_input_data_frame.rdd)

    context = {
        'sc': self.spark_context,
        'spark': new_spark_session,
        'sqlContext': new_sql_context
    }

    log_debug('executing code... {}\n'.format(context))
    try:
        exec(custom_operation_code, context)
    except ImportError as e:
        log_debug('ImportError!!! ==> {}\n'.format(e.msg))
        raise Exception('ImportError!!! ==> {}\n'.format(e.msg))
    log_debug('FINISH\n')

    output_data = context[self.TRANSFORM_FUNCTION_NAME](input_data_frame)
    try:
        output_data_frame = self._convert_data_to_data_frame(output_data)
    except Exception:
        error_message = (
            'Operation returned {} instead of a DataFrame'.format(output_data) +
            ' (or pandas.DataFrame, single value, tuple/list of single values,' +
            ' tuple/list of tuples/lists of single values) (pandas library available: ' +
            str(self.is_pandas_available) + ').')
        log_debug(error_message)
        raise Exception(error_message)

    # noinspection PyProtectedMember
    self.entry_point.registerOutputDataFrame(workflow_id,
                                             node_id,
                                             CodeExecutor.OUTPUT_PORT_NUMBER,
                                             output_data_frame._jdf)
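# _run_custom_code delegates output coercion to _convert_data_to_data_frame, which is not
# shown in this section. Based on the error message above, it accepts a Spark DataFrame,
# a pandas.DataFrame (when pandas is available), a single value, or (nested) tuples/lists
# of single values. The sketch below is one plausible shape for that helper, not the
# original implementation.
def _convert_data_to_data_frame(self, data):
    spark_session = self.spark_sql_session
    if isinstance(data, DataFrame):
        # Already a Spark DataFrame - nothing to convert.
        return data
    if self.is_pandas_available:
        import pandas
        if isinstance(data, pandas.DataFrame):
            return spark_session.createDataFrame(data)
    if isinstance(data, (list, tuple)):
        if data and isinstance(data[0], (list, tuple)):
            # Tuple/list of tuples/lists of single values -> one row per inner sequence.
            return spark_session.createDataFrame([list(row) for row in data])
        # Tuple/list of single values -> a single row.
        return spark_session.createDataFrame([list(data)])
    # Single value -> a 1x1 DataFrame; unsupported types make createDataFrame raise,
    # which _run_custom_code turns into the descriptive exception above.
    return spark_session.createDataFrame([[data]])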