def get_connection(self, host, port):
    """Open a socket connection to the given host and port and write the Hadoop header.

    The Hadoop RPC protocol looks like this when creating a connection:

    +---------------------------------------------------------------------+
    |  Header, 4 bytes ("hrpc")                                           |
    +---------------------------------------------------------------------+
    |  Version, 1 byte (default version 9)                                |
    +---------------------------------------------------------------------+
    |  RPC service class, 1 byte (0x00)                                   |
    +---------------------------------------------------------------------+
    |  Auth protocol, 1 byte (Auth method None = 0)                       |
    +---------------------------------------------------------------------+
    |  Length of the delimited RpcRequestHeaderProto + length of the      |
    |  delimited IpcConnectionContextProto (4 bytes/32 bit int)           |
    +---------------------------------------------------------------------+
    |  Serialized delimited RpcRequestHeaderProto                         |
    +---------------------------------------------------------------------+
    |  Serialized delimited IpcConnectionContextProto                     |
    +---------------------------------------------------------------------+

    The auth protocol byte is AUTH_PROTOCOL_SASL when ``self.token`` is set and
    AUTH_PROTOCOL_NONE otherwise. The new socket is stored on ``self.sock``.

    :param host: host name or address passed to ``socket.connect``
    :param port: TCP port passed to ``socket.connect``
    """
    log.debug("############## CONNECTING ##############")

    auth = self.AUTH_PROTOCOL_NONE if self.token is None else self.AUTH_PROTOCOL_SASL

    # Open socket
    self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    self.sock.settimeout(self.timeout)
    # Connect socket to server - defined by host and port arguments
    self.sock.connect((host, port))

    # Send RPC headers
    self.write(self.RPC_HEADER)  # header
    self.write(struct.pack("B", self.version))  # version
    self.write(struct.pack("B", self.RPC_SERVICE_CLASS))  # RPC service class
    self.write(struct.pack("B", auth))  # auth protocol

    if auth == SocketRpcChannel.AUTH_PROTOCOL_SASL:
        self.negotiate_sasl(self.token)
        # NOTE(review): presumably the negotiation changes the call id, so it is
        # reset here before the connection context is sent - confirm.
        self.call_id = -3

    rpc_header = self.create_rpc_request_header()
    # Compare the protocol constant by value: `is` on integers only works by
    # accident of the interpreter's small-int caching.
    if auth == self.AUTH_PROTOCOL_NONE:
        context = self.create_connection_context()
    else:
        context = self.create_connection_context_auth()

    # Each message is written delimited, so its varint length prefix counts too.
    header_length = (
        len(rpc_header)
        + encoder._VarintSize(len(rpc_header))
        + len(context)
        + encoder._VarintSize(len(context))
    )

    if log.isEnabledFor(logging.DEBUG):
        log.debug("Header length: %s (%s)" % (header_length, format_bytes(struct.pack("!I", header_length))))

    self.write(struct.pack("!I", header_length))
    self.write_delimited(rpc_header)
    self.write_delimited(context)
def get_connection(self, host, port):
    '''Open a socket connection to the given host and port and write the Hadoop header.

    The Hadoop RPC protocol looks like this when creating a connection:

    +---------------------------------------------------------------------+
    |  Header, 4 bytes ("hrpc")                                           |
    +---------------------------------------------------------------------+
    |  Version, 1 byte (default version 9)                                |
    +---------------------------------------------------------------------+
    |  RPC service class, 1 byte (0x00)                                   |
    +---------------------------------------------------------------------+
    |  Auth protocol, 1 byte (Auth method None = 0)                       |
    +---------------------------------------------------------------------+
    |  Length of the delimited RpcRequestHeaderProto + length of the      |
    |  delimited IpcConnectionContextProto (4 bytes/32 bit int)           |
    +---------------------------------------------------------------------+
    |  Serialized delimited RpcRequestHeaderProto                         |
    +---------------------------------------------------------------------+
    |  Serialized delimited IpcConnectionContextProto                     |
    +---------------------------------------------------------------------+

    The new socket is stored on ``self.sock``. ``self.sock_connect_timeout``
    applies while connecting and ``self.sock_request_timeout`` afterwards; both
    are divided by 1000 before being handed to ``settimeout``.

    :param host: host name or address passed to ``socket.connect``
    :param port: TCP port passed to ``socket.connect``
    :raises TransientException: if SASL is enabled but the SASL handshake fails
    '''
    log.debug("############## CONNECTING ##############")

    # Open socket
    self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    # Divide by a float: under Python 2 integer division a sub-second value
    # would truncate to 0, and settimeout(0) makes the socket non-blocking.
    self.sock.settimeout(self.sock_connect_timeout / 1000.0)
    # Connect socket to server - defined by host and port arguments
    self.sock.connect((host, port))
    self.sock.settimeout(self.sock_request_timeout / 1000.0)

    if self.use_sasl:
        auth_protocol = self.AUTH_PROTOCOL_SASL
    else:
        auth_protocol = self.AUTH_PROTOCOL_NONE

    # Send RPC headers
    self.write(self.RPC_HEADER)  # header
    self.write(struct.pack('B', self.version))  # version
    self.write(struct.pack('B', self.RPC_SERVICE_CLASS))  # RPC service class
    self.write(struct.pack('B', auth_protocol))  # auth protocol

    if self.use_sasl:
        sasl = SaslRpcClient(self, hdfs_namenode_principal=self.hdfs_namenode_principal)
        sasl_connected = sasl.connect()
        if not sasl_connected:
            raise TransientException("SASL is configured, but cannot get connected")

    rpc_header = self.create_rpc_request_header()
    context = self.create_connection_context()

    # Each message is written delimited, so its varint length prefix counts too.
    header_length = (len(rpc_header) + encoder._VarintSize(len(rpc_header)) +
                     len(context) + encoder._VarintSize(len(context)))

    if log.isEnabledFor(logging.DEBUG):
        log.debug("Header length: %s (%s)" % (header_length, format_bytes(struct.pack('!I', header_length))))

    self.write(struct.pack('!I', header_length))
    self.write_delimited(rpc_header)
    self.write_delimited(context)
def send_rpc_message(self, method, request):
    '''Serialize one RPC call and send it over the channel.

    The RpcRequestHeaderProto and RequestHeaderProto are built here via
    ``create_rpc_request_header`` and ``create_request_header``; ``request`` is
    serialized with ``SerializeToString``.

    Layout of a request on the wire:

    +---------------------------------------------------------------------+
    |  Length of the next three parts (4 bytes/32 bit int)                |
    +---------------------------------------------------------------------+
    |  Delimited serialized RpcRequestHeaderProto (varint len + header)   |
    +---------------------------------------------------------------------+
    |  Delimited serialized RequestHeaderProto (varint len + header)      |
    +---------------------------------------------------------------------+
    |  Delimited serialized Request (varint len + request)                |
    +---------------------------------------------------------------------+

    When ``self.use_sasl`` is set and ``self.sasl.use_wrap()`` is true, the
    complete frame is passed to ``self.sasl.wrap`` instead of being written
    directly.
    '''
    log.debug("############## SENDING ##############")

    # The three frame parts, built in wire order.
    call_header = self.create_rpc_request_header()
    method_header = self.create_request_header(method)
    payload = request.SerializeToString()
    if log.getEffectiveLevel() == logging.DEBUG:
        log_protobuf_message("Request", request)

    parts = (call_header, method_header, payload)

    # Every part travels with its own varint length prefix.
    rpc_message_length = 0
    for part in parts:
        rpc_message_length += len(part) + encoder._VarintSize(len(part))

    if log.getEffectiveLevel() == logging.DEBUG:
        log.debug("RPC message length: %s (%s)" % (rpc_message_length, format_bytes(struct.pack('!I', rpc_message_length))))

    if self.use_sasl and self.sasl.use_wrap():
        log.debug("SASL QOP requested, wrapping RPC message.")
        frame = struct.pack('!I', rpc_message_length)
        for part in parts:
            frame += encoder._VarintBytes(len(part)) + part
        self.sasl.wrap(frame)
        return

    self.write(struct.pack('!I', rpc_message_length))
    for part in parts:
        self.write_delimited(part)
def send_rpc_message(self, method, request):
    """Serialize one RPC call and write it to the channel.

    The RpcRequestHeaderProto and RequestHeaderProto are built here via
    ``create_rpc_request_header`` and ``create_request_header``; ``request`` is
    serialized with ``SerializeToString``.

    Layout of a request on the wire:

    +---------------------------------------------------------------------+
    |  Length of the next three parts (4 bytes/32 bit int)                |
    +---------------------------------------------------------------------+
    |  Delimited serialized RpcRequestHeaderProto (varint len + header)   |
    +---------------------------------------------------------------------+
    |  Delimited serialized RequestHeaderProto (varint len + header)      |
    +---------------------------------------------------------------------+
    |  Delimited serialized Request (varint len + request)                |
    +---------------------------------------------------------------------+
    """
    log.debug("############## SENDING ##############")

    # The three frame parts, built in wire order.
    call_header = self.create_rpc_request_header()
    method_header = self.create_request_header(method)
    payload = request.SerializeToString()
    if log.getEffectiveLevel() == logging.DEBUG:
        log_protobuf_message("Request", request)

    parts = (call_header, method_header, payload)

    # Every part travels with its own varint length prefix.
    rpc_message_length = sum(len(part) + encoder._VarintSize(len(part)) for part in parts)

    if log.getEffectiveLevel() == logging.DEBUG:
        log.debug(
            "RPC message length: %s (%s)"
            % (rpc_message_length, format_bytes(struct.pack("!I", rpc_message_length)))
        )

    self.write(struct.pack("!I", rpc_message_length))
    for part in parts:
        self.write_delimited(part)
def create_rpc_request(self, method, request):
    '''Wrap the user's request in a HadoopRpcRequestProto and serialize it delimited.

    :param method: object whose ``name`` attribute is used as the RPC method name
    :param request: protobuf message; its serialized bytes become the payload
    :returns: the serialized HadoopRpcRequestProto prefixed with its varint length
    '''
    s_request = request.SerializeToString()
    log_protobuf_message("Protobuf message", request)
    log.debug("Protobuf message bytes (%d): %s" % (len(s_request), format_bytes(s_request)))

    rpc_request = hadoop_rpc.HadoopRpcRequestProto()
    rpc_request.methodName = method.name
    rpc_request.request = s_request
    rpc_request.declaringClassProtocolName = "org.apache.hadoop.hdfs.protocol.ClientProtocol"
    # Plain integer literal: the Python 2-only `1L` spelling is a SyntaxError
    # on Python 3, and a plain int is accepted on both.
    rpc_request.clientProtocolVersion = 1

    # Serialize delimited
    s_rpc_request = rpc_request.SerializeToString()
    log_protobuf_message("RpcRequest (len: %d)" % len(s_rpc_request), rpc_request)
    return encoder._VarintBytes(len(s_rpc_request)) + s_rpc_request
def get_delimited_message_bytes(byte_stream):
    '''Read one length-delimited protobuf message from the stream.

    A delimited message is a protobuf varint holding the message length,
    followed by that many bytes of message. The varint's size is not known up
    front, so 4 bytes are read and decoded. The decoder returns the value and
    the position just past the varint, so the stream is rewound by the bytes
    that were read beyond it, because those belong to the message itself.

    Note: a 32-bit varint can occupy up to 5 bytes; a 4-byte read only covers
    lengths below 2**28.

    :param byte_stream: object providing ``read(n)`` and ``rewind(n)``
    :returns: the raw bytes of the message, without the length prefix
    '''
    prefix = byte_stream.read(4)
    (length, pos) = decoder._DecodeVarint32(prefix, 0)
    log.debug("Delimited message length (pos %d): %d" % (pos, length))

    # Rewind by what was actually read past the varint, so a short read does
    # not move the stream back further than it advanced.
    byte_stream.rewind(len(prefix) - pos)

    message_bytes = byte_stream.read(length)
    log.debug("Delimited message bytes (%d): %s" % (len(message_bytes), format_bytes(message_bytes)))
    return message_bytes
def get_delimited_message_bytes(byte_stream, nr=4):
    '''Read one length-delimited protobuf message from the stream.

    A delimited message is a protobuf varint holding the message length,
    followed by that many bytes of message. The varint's size is not known up
    front, so ``nr`` bytes are read and decoded. The decoder returns the value
    and the position just past the varint, so the stream is rewound by the
    bytes that were read beyond it, because those belong to the message itself.

    Note: a 32-bit varint can occupy up to 5 bytes; the default 4-byte read
    only covers lengths below 2**28.

    :param byte_stream: object providing ``read(n)`` and ``rewind(n)``
    :param nr: number of bytes to read when looking for the length varint
    :returns: tuple ``(total_len, message_bytes)`` where ``total_len`` is the
        size of the varint prefix plus the message length
    '''
    prefix = byte_stream.read(nr)
    (length, pos) = decoder._DecodeVarint32(prefix, 0)

    if log.isEnabledFor(logging.DEBUG):
        log.debug("Delimited message length (pos %d): %d" % (pos, length))

    # Rewind by what was actually read past the varint, so a short read does
    # not move the stream back further than it advanced.
    byte_stream.rewind(len(prefix) - pos)

    message_bytes = byte_stream.read(length)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("Delimited message bytes (%d): %s" % (len(message_bytes), format_bytes(message_bytes)))

    total_len = length + pos
    return (total_len, message_bytes)
def parse_response(self, byte_stream, response_class):
    '''Parse a Hadoop RPC response read from ``byte_stream``.

    The RpcResponseHeaderProto carries a status field marking SUCCESS or ERROR.

    A SUCCESS response looks like this:

    +-----------------------------------------------------------+
    |  Delimited serialized RpcResponseHeaderProto              |
    +-----------------------------------------------------------+
    |  Length of the RPC response (4 bytes/32 bit int)          |
    +-----------------------------------------------------------+
    |  Serialized RPC response                                  |
    +-----------------------------------------------------------+

    An ERROR response looks like this:

    +-----------------------------------------------------------+
    |  Delimited serialized RpcResponseHeaderProto              |
    +-----------------------------------------------------------+
    |  Length of the RPC response (4 bytes/32 bit int)          |
    +-----------------------------------------------------------+
    |  Length of the Exception class name (4 bytes/32 bit int)  |
    +-----------------------------------------------------------+
    |  Exception class name string                              |
    +-----------------------------------------------------------+
    |  Length of the stack trace (4 bytes/32 bit int)           |
    +-----------------------------------------------------------+
    |  Stack trace string                                       |
    +-----------------------------------------------------------+

    If the length of a string is -1, the string is null.

    Returns the parsed ``response_class`` instance on success. Returns None
    when the response length is 0, and also when ``handle_error`` returns
    instead of raising or the status is neither 0 nor 1.
    '''
    log.debug("############## PARSING ##############")
    log.debug("Payload class: %s" % response_class)

    # Peek at the first 8 bytes to spot a protocol-level error, then put them back.
    (marker,) = struct.unpack("!Q", byte_stream.read(8))
    if marker == self.ERROR_BYTES:
        self.handle_error(byte_stream)
    byte_stream.rewind(8)

    log.debug("---- Parsing header ----")
    raw_header = get_delimited_message_bytes(byte_stream)
    header = rpcheaderproto.RpcResponseHeaderProto()
    header.ParseFromString(raw_header)
    log_protobuf_message("Response header", header)

    if header.status == 1:  # rpcheaderproto.RpcStatusProto.Value('ERROR')
        self.handle_error(byte_stream)
        return None

    if header.status != 0:  # anything other than SUCCESS / ERROR is ignored
        return None

    # rpcheaderproto.RpcStatusProto.Value('SUCCESS')
    log.debug("---- Parsing response ----")
    response = response_class()
    size = self.get_length(byte_stream)
    if size == 0:
        return None

    raw_response = byte_stream.read(size)
    log.debug("Response bytes (%d): %s" % (len(raw_response), format_bytes(raw_response)))
    response.ParseFromString(raw_response)
    log_protobuf_message("Response", response)
    return response
def write(self, data):
    """Send all of ``data`` over ``self.sock``.

    ``socket.send`` may transmit only part of the buffer and returns how much
    was sent; ``sendall`` keeps sending until everything is written or an error
    occurs, so an RPC frame cannot be silently truncated.

    :param data: the bytes to send
    """
    if log.isEnabledFor(logging.DEBUG):
        log.debug("Sending: %s", format_bytes(data))
    self.sock.sendall(data)
def parse_response(self, byte_stream, response_class):
    '''Parse a Hadoop RPC response read from ``byte_stream``.

    The RpcResponseHeaderProto carries a status field marking SUCCESS or ERROR.

    A SUCCESS response looks like this:

    +-----------------------------------------------------------+
    |  Delimited serialized RpcResponseHeaderProto              |
    +-----------------------------------------------------------+
    |  Length of the RPC response (4 bytes/32 bit int)          |
    +-----------------------------------------------------------+
    |  Serialized RPC response                                  |
    +-----------------------------------------------------------+

    An ERROR response looks like this:

    +-----------------------------------------------------------+
    |  Delimited serialized RpcResponseHeaderProto              |
    +-----------------------------------------------------------+
    |  Length of the RPC response (4 bytes/32 bit int)          |
    +-----------------------------------------------------------+
    |  Length of the Exception class name (4 bytes/32 bit int)  |
    +-----------------------------------------------------------+
    |  Exception class name string                              |
    +-----------------------------------------------------------+
    |  Length of the stack trace (4 bytes/32 bit int)           |
    +-----------------------------------------------------------+
    |  Stack trace string                                       |
    +-----------------------------------------------------------+

    If the length of a string is -1, the string is null.

    :param byte_stream: object providing ``read(n)``
    :param response_class: protobuf message class to parse the payload into
    :returns: the parsed response, or None when the response length is 0 or
        the status is neither 0 nor 1
    :raises RequestError: on an ERROR status. The message is the first line of
        the stack trace, or the exception class name when the stack trace is
        null, or a generic text when both are null.
    '''
    log.debug("############## PARSING ##############")
    log.debug("Payload class: %s" % response_class)

    log.debug("---- Parsing header ----")
    # NOTE(review): the method name is spelled "nessage"; kept as-is because
    # its definition is not visible here - confirm against the class.
    header_bytes = self.get_delimited_nessage_bytes(byte_stream)
    header = rpcheaderproto.RpcResponseHeaderProto()
    header.ParseFromString(header_bytes)
    self.log_protobuf_message("Response header", header)

    if header.status == 0:  # rpcheaderproto.RpcStatusProto.Value('SUCCESS')
        log.debug("---- Parsing response ----")
        response = response_class()
        response_length = self.get_length(byte_stream)

        if response_length == 0:
            return

        response_bytes = byte_stream.read(response_length)
        log.debug("Response bytes (%d): %s" % (len(response_bytes), format_bytes(response_bytes)))
        response.ParseFromString(response_bytes)
        self.log_protobuf_message("Response", response)
        return response

    elif header.status == 1:  # rpcheaderproto.RpcStatusProto.Value('ERROR')
        def read_nullable_string(label):
            # Read a length-prefixed string; a length of -1 encodes null.
            length = self.get_length(byte_stream)
            log.debug("%s length: %d" % (label, length))
            if length == -1:
                return None
            value = byte_stream.read(length)
            log.debug("%s (%d): %s" % (label, len(value), value))
            return value

        class_name = read_nullable_string("Class name")
        stack_trace = read_nullable_string("Stack trace")

        # A null stack trace must still surface as RequestError rather than
        # an AttributeError from calling .split on None.
        if stack_trace is not None:
            error_msg = stack_trace.split("\n")[0]
        elif class_name is not None:
            error_msg = class_name
        else:
            error_msg = "RPC error without exception class or stack trace"

        log.debug(error_msg)
        raise RequestError(error_msg)