    def run(self):
        gateway = self._initialize_gateway(self.gateway_address)
        if not gateway:
            log_error('Failed to initialize java gateway')
            return

        # noinspection PyProtectedMember
        # The callback server was started on port 0, so read back the
        # ephemeral port the OS actually assigned.
        callback_server_port = gateway._callback_server.server_socket.getsockname()[1]
        spark_context, sql_context = self._initialize_spark_contexts(gateway)
        code_executor = CodeExecutor(spark_context, sql_context, gateway.entry_point)

        try:
            gateway.entry_point.registerCallbackServerPort(callback_server_port)
            gateway.entry_point.registerCodeExecutor(code_executor)
        except Py4JError as e:
            log_error('Exception while registering codeExecutor or callback server port: {}'.format(e))
            gateway.close()
            return

        # Wait for the end of the world
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            log_debug("Exiting on user's request")
            gateway.close()
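registerCodeExecutor above passes a Python object to the JVM, which will later invoke methods on it. Py4J only dispatches such JVM-to-Python calls to objects that declare the Java interface they implement through an inner Java class; a minimal sketch of the shape CodeExecutor therefore needs (the interface's fully qualified name is a placeholder, not taken from this listing):

class CodeExecutor(object):
    def __init__(self, spark_context, sql_context, entry_point):
        self.spark_context = spark_context
        self.sql_context = sql_context
        self.entry_point = entry_point

    class Java:
        # Py4J requires callback objects to name the Java interface they
        # implement; the fully qualified name below is hypothetical.
        implements = ['com.example.executor.CustomCodeExecutor']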
Example #2
    @staticmethod
    def _initialize_gateway(gateway_address):
        (host, port) = gateway_address

        # Port 0 asks the OS for an ephemeral port; the assigned port is read
        # back in run() and registered with the JVM side.
        callback_params = CallbackServerParameters(address=host, port=0)

        gateway = JavaGateway(GatewayClient(address=host, port=port),
                              start_callback_server=True,
                              auto_convert=True,
                              callback_server_parameters=callback_params)
        try:
            java_import(gateway.jvm, "org.apache.spark.SparkEnv")
            java_import(gateway.jvm, "org.apache.spark.SparkConf")
            java_import(gateway.jvm, "org.apache.spark.api.java.*")
            java_import(gateway.jvm, "org.apache.spark.api.python.*")
            java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
            java_import(gateway.jvm, "org.apache.spark.sql.*")
            java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
            java_import(gateway.jvm, "scala.Tuple2")
            java_import(gateway.jvm, "scala.collection.immutable.List")
        except Py4JError as e:
            log_error('Error while initializing java gateway: {}'.format(e))
            gateway.close()
            return None

        log_debug('Java Gateway initialized {}'.format(gateway))
        return gateway

    def run(self):
        gateway = self._initialize_gateway(self.gateway_address)
        if not gateway:
            log_error('Failed to initialize java gateway')
            return

        # noinspection PyProtectedMember
        # Read back the ephemeral port assigned to the callback server.
        callback_server_port = gateway._callback_server.server_socket.getsockname()[1]
        spark_context, spark_session = self._initialize_spark_contexts(gateway)
        code_executor = CodeExecutor(spark_context, spark_session,
                                     gateway.entry_point)

        try:
            gateway.entry_point.registerCallbackServerPort(
                callback_server_port)
            gateway.entry_point.registerCodeExecutor(code_executor)
        except Py4JError as e:
            log_error('Exception while registering codeExecutor or callback '
                      'server port: {}'.format(e))
            gateway.close()
            return

        # Wait for the end of the world
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            log_debug("Exiting on user's request")
            gateway.close()
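Both run variants call _initialize_spark_contexts, which is not included in this listing. A minimal sketch of what it plausibly does for the SparkSession variant, shown as a module-level function for self-containment; the entry-point methods getSparkContext and getSparkSession are assumed names, not taken from this listing:

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


def _initialize_spark_contexts(gateway):
    # Wrap the JVM-side SparkContext exposed by the entry point (assumed
    # method names) in its PySpark counterparts.
    java_spark_context = gateway.entry_point.getSparkContext()
    java_spark_conf = java_spark_context.getConf()
    spark_context = SparkContext(
        conf=SparkConf(_jconf=java_spark_conf),
        gateway=gateway,
        jsc=java_spark_context)
    spark_session = SparkSession(
        spark_context, gateway.entry_point.getSparkSession())
    return spark_context, spark_session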
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gateway-address', action='store', required=True)
    args = parser.parse_args()

    host, port = args.gateway_address.split(':')
    gateway_address = (host, int(port))

    log_debug('Initializing PyExecutor at {}'.format(gateway_address))
    py_executor = PyExecutor(gateway_address=gateway_address)
    py_executor.run()
    log_debug('PyExecutor ended!')
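For reference, a typical launch, assuming the JVM side already runs a Py4J GatewayServer on its default port 25333 and the script file is named py_executor.py (both are assumptions):

    python py_executor.py --gateway-address 127.0.0.1:25333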
Example #5
    def isValid(self, custom_operation_code):
        def is_transform_function(field):
            return (isinstance(field, ast.FunctionDef)
                    and field.name == self.TRANSFORM_FUNCTION_NAME
                    and len(field.args.args) in self.TRANSFORM_FUNCTION_ARITIES)

        try:
            parsed = ast.parse(custom_operation_code)
        except SyntaxError:
            return False

        is_valid = any(is_transform_function(field) for field in parsed.body)
        log_debug('Valid code? {}: {}'.format(is_valid, custom_operation_code))
        return is_valid
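For illustration, snippets that pass and fail this check, assuming TRANSFORM_FUNCTION_NAME is 'transform' and an arity of 1 is allowed (both are assumptions about the class constants):

valid_code = (
    'def transform(data_frame):\n'
    '    return data_frame.limit(10)\n')
invalid_code = 'def not_transform(data_frame): pass'
# executor.isValid(valid_code)   -> True (top-level def with the right name/arity)
# executor.isValid(invalid_code) -> False (wrong function name)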
Example #6
    def _run_custom_code(self, workflow_id, node_id, custom_operation_code):
        """
        :param workflow_id:
        :param node_id: id of node of the DOperation associated with the custom code
        :param custom_operation_code: The code is expected to include a top-level definition
        of a function named according to TRANSFORM_FUNCTION_NAME value
        :return: None
        """

        # This should've been checked before running
        assert self.isValid(custom_operation_code)

        new_spark_session = self.spark_sql_session.newSession()

        spark_version = self.spark_context.version
        if spark_version in [
                "2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1", "2.2.0"
        ]:
            new_sql_context = SQLContext(self.spark_context, new_spark_session)
        else:
            error_message = 'Spark version {} is not supported'.format(spark_version)
            log_debug(error_message)
            raise ValueError(error_message)

        raw_input_data_frame = DataFrame(
            jdf=self.entry_point.retrieveInputDataFrame(
                workflow_id, node_id, CodeExecutor.INPUT_PORT_NUMBER),
            sql_ctx=new_sql_context)
        # Re-create the frame from its RDD so it is bound to the fresh session
        # the custom code will run in.
        input_data_frame = new_spark_session.createDataFrame(
            raw_input_data_frame.rdd)

        context = {
            'sc': self.spark_context,
            'spark': new_spark_session,
            'sqlContext': new_sql_context
        }

        log_debug('executing code... {}\n'.format(context))
        try:
            exec(custom_operation_code, context)
        except ImportError as e:
            error_message = 'ImportError: {}\n'.format(e.msg)
            log_debug(error_message)
            raise Exception(error_message)
        log_debug('FINISH\n')

        output_data = context[self.TRANSFORM_FUNCTION_NAME](input_data_frame)
        try:
            output_data_frame = self._convert_data_to_data_frame(output_data)
        except Exception:
            error_message = (
                'Operation returned {} instead of a DataFrame '
                '(or pandas.DataFrame, single value, tuple/list of single values, '
                'tuple/list of tuples/lists of single values) '
                '(pandas library available: {}).'.format(
                    output_data, self.is_pandas_available))
            log_debug(error_message)
            raise Exception(error_message)

        # noinspection PyProtectedMember
        # Hand the underlying JVM DataFrame (_jdf) back to the entry point.
        self.entry_point.registerOutputDataFrame(
            workflow_id, node_id, CodeExecutor.OUTPUT_PORT_NUMBER,
            output_data_frame._jdf)
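_convert_data_to_data_frame is referenced above but not part of this listing. A minimal standalone sketch of the conversions its error message promises (pandas.DataFrame, single value, list/tuple of values, list/tuple of rows), assuming a SparkSession is at hand; the real method may differ:

def _convert_data_to_data_frame(spark_session, data):
    from pyspark.sql import DataFrame
    if isinstance(data, DataFrame):
        return data  # already a Spark DataFrame
    try:
        import pandas
        if isinstance(data, pandas.DataFrame):
            return spark_session.createDataFrame(data)
    except ImportError:
        pass  # pandas not available
    if isinstance(data, (list, tuple)):
        if data and isinstance(data[0], (list, tuple)):
            # list/tuple of rows
            return spark_session.createDataFrame(data)
        # list/tuple of single values -> one-column rows
        return spark_session.createDataFrame([(value,) for value in data])
    # single value -> one-row, one-column frame
    return spark_session.createDataFrame([(data,)])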