def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    connections = self.get_connections(self.hdfs_conn_id)

    use_sasl = False
    if configuration.get('core', 'security') == 'kerberos':
        use_sasl = True

    client = None
    # When using HAClient, proxy_user must be the same for every namenode,
    # so it is ok to always take the first connection's login.
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        if autoconfig:
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        else:
            client = Client(connections[0].host, connections[0].port,
                            effective_user=effective_user, use_sasl=use_sasl)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
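A minimal usage sketch for the hook above, assuming the Airflow 1.x import path `airflow.hooks.hdfs_hook.HDFSHook` and a configured connection id `hdfs_default` (both are assumptions); the returned snakebite client is then used directly:

from airflow.hooks.hdfs_hook import HDFSHook

hook = HDFSHook(hdfs_conn_id='hdfs_default')
client = hook.get_conn()
# snakebite's ls() yields one dict per directory entry.
for entry in client.ls(['/user']):
    print(entry['path'])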
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    use_sasl = False
    security_config = None  # TODO: read the security mode from a configuration file
    if security_config == 'kerberos':
        use_sasl = True

    connections = self.get_connections(self.hdfs_conn_id)
    client = None
    # When using HAClient, proxy_user must be the same, so it is ok to
    # always take the first connection's login.
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        client = Client(connections[0].host, connections[0].port,
                        use_sasl=use_sasl, effective_user=effective_user)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, use_sasl=use_sasl, effective_user=effective_user)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def test_ha_client_socket_timeout(self):
    e = socket.timeout
    mocked_client_cat = Mock(side_effect=e)
    ha_client = HAClient([Namenode("foo"), Namenode("bar")])
    ha_client.cat = HAClient._ha_gen_method(mocked_client_cat)
    cat_result_gen = ha_client.cat(ha_client, ['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
def test_ha_client_standby_error(self):
    e = RequestError("org.apache.hadoop.ipc.StandbyException foo bar")
    mocked_client_cat = Mock(side_effect=e)
    ha_client = HAClient([Namenode("foo"), Namenode("bar")])
    ha_client.cat = HAClient._ha_gen_method(mocked_client_cat)
    cat_result_gen = ha_client.cat(ha_client, ['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
def test_ha_client_ehostunreach_socket_error(self):
    e = socket.error
    e.errno = errno.EHOSTUNREACH
    mocked_client_cat = Mock(side_effect=e)
    ha_client = HAClient([Namenode("foo"), Namenode("bar")])
    ha_client.cat = HAClient._ha_gen_method(mocked_client_cat)
    cat_result_gen = ha_client.cat(ha_client, ['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
def test_ha_client_econnrefused_socket_error(self):
    e = SocketError
    e.errno = errno.ECONNREFUSED
    mocked_client_cat = Mock(side_effect=e)
    ha_client = HAClient([Namenode("foo"), Namenode("bar")])
    ha_client.cat = HAClient._ha_gen_method(mocked_client_cat)
    cat_result_gen = ha_client.cat(ha_client, ['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
def test_ha_client_retry(self, rpc_call):
    retry_attempts = 3
    e = RequestError("org.apache.hadoop.ipc.RetriableException foo bar")
    rpc_call.side_effect = e
    nns = [Namenode("foo"), Namenode("bar")]
    ha_client = HAClient(nns, max_retries=retry_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(RequestError, all, cat_result_gen)
    self.assertEqual(rpc_call.call_count, 1 + retry_attempts)
def test_ha_client_failover_retry_for_exception(self, rpc_call):
    failover_attempts = 3
    e = RequestError("org.apache.hadoop.ipc.StandbyException foo bar")
    rpc_call.side_effect = e
    nns = [Namenode("foo", 8020), Namenode("bar", 8020)]
    ha_client = HAClient(nns, max_failovers=failover_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
    self.assertEqual(rpc_call.call_count, 1 + failover_attempts)
def setup_client(self):
    if 'skiptrash' in self.args:
        use_trash = self.args.usetrash and not self.args.skiptrash
    else:
        use_trash = self.args.usetrash
    self.client = HAClient(self.namenodes,
                           use_trash,
                           self.user,
                           self.use_sasl,
                           self.configs['hdfs_namenode_principal'],
                           self.configs['failover_max_attempts'],
                           self.configs['client_retries'],
                           self.configs['client_sleep_base_millis'],
                           self.configs['client_sleep_max_millis'],
                           self.configs['socket_timeout_millis'],
                           use_datanode_hostname=self.configs['use_datanode_hostname'])
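Since `setup_client` pulls everything from `self.configs`, here is a hypothetical sketch of that mapping; the keys mirror the lookups above, while the values are illustrative assumptions, not defaults from any particular release:

# Hypothetical example of the configs mapping consumed by setup_client.
configs = {
    'hdfs_namenode_principal': 'hdfs/_HOST@EXAMPLE.COM',  # placeholder principal
    'failover_max_attempts': 2,
    'client_retries': 3,
    'client_sleep_base_millis': 500,
    'client_sleep_max_millis': 10000,
    'socket_timeout_millis': 60000,
    'use_datanode_hostname': False,
}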
def test_response_error_no_client_retry(self, rpc_call):
    retry_attempts = 3
    e = RpcResponseError("Response read error")
    rpc_call.side_effect = e
    nns = [Namenode("foo")]
    ha_client = HAClient(nns, max_retries=retry_attempts)
    cat_result_gen = ha_client.rename(['foobar'], 'foo')
    self.assertRaises(RpcResponseError, all, cat_result_gen)
    self.assertEqual(rpc_call.call_count, 1)
def monitor_db_size():
    try:
        # Connect to the namenode HA service with connect- and
        # request-timeout settings; n1 and n2 are Namenode objects
        # defined elsewhere.
        client = HAClient([n1, n2], use_trash=True,
                          sock_connect_timeout=50000,
                          sock_request_timeout=50000)
    except Exception as ex:
        pass
def test_ha_client_failover_retry_for_exception2(self, get_connection):
    failover_attempts = 2
    e = RequestError("org.apache.hadoop.ipc.StandbyException foo bar")
    get_connection.side_effect = e
    nns = [Namenode("foo"), Namenode("bar")]
    ha_client = HAClient(nns, max_failovers=failover_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
    calls = [call("foo", 8020), call("bar", 8020), call("foo", 8020)]
    get_connection.assert_has_calls(calls)
def test_ha_client_failover_retry(self, rpc_call):
    failover_attempts = 3
    e = socket.timeout
    e.message = "socket.timeout"
    rpc_call.side_effect = e
    nns = [Namenode("foo"), Namenode("bar")]
    ha_client = HAClient(nns, max_failovers=failover_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
    self.assertEqual(rpc_call.call_count, 1 + failover_attempts)
def test_ha_client_retry2(self, get_connection):
    retry_attempts = 2
    e = RequestError("org.apache.hadoop.ipc.RetriableException foo bar")
    get_connection.side_effect = e
    nns = [Namenode("foo", 8020), Namenode("bar", 8020)]
    ha_client = HAClient(nns, max_retries=retry_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(RequestError, all, cat_result_gen)
    calls = [call("foo", 8020), call("foo", 8020), call("foo", 8020)]
    get_connection.assert_has_calls(calls)
def test_ha_client_failover_retry2(self, get_connection):
    failover_attempts = 2
    e = socket.timeout
    e.message = "socket.timeout"
    get_connection.side_effect = e
    nns = [Namenode("foo", 8020), Namenode("bar", 8020)]
    ha_client = HAClient(nns, max_failovers=failover_attempts)
    cat_result_gen = ha_client.cat(['foobar'])
    self.assertRaises(OutOfNNException, all, cat_result_gen)
    calls = [call("foo", 8020), call("bar", 8020), call("foo", 8020)]
    get_connection.assert_has_calls(calls)
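Taken together, the tests above exercise HAClient's two HA knobs: `max_retries` re-issues the call against the same namenode on a RetriableException, while `max_failovers` moves to the next namenode on a StandbyException or a socket error. A minimal construction sketch combining both (hostnames and ports are placeholders):

from snakebite.client import HAClient
from snakebite.namenode import Namenode

nns = [Namenode("nn1.example.com", 8020), Namenode("nn2.example.com", 8020)]
# Up to 3 retries against the active namenode, up to 2 failovers between namenodes.
client = HAClient(nns, max_retries=3, max_failovers=2)
for entry in client.ls(['/']):
    print(entry['path'])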
def get_snakebite_hdfs_client():
    """
    Get an HDFS client from the snakebite library.
    :return: snakebite HDFS Client
    """
    n1 = Namenode("hadoop101", 9000)
    n2 = Namenode("hadoop102", 9000)
    client = HAClient([n1, n2], effective_user="******",
                      sock_request_timeout=10000000000)
    return client
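A quick usage sketch of the helper above; the path is illustrative:

client = get_snakebite_hdfs_client()
# Each entry is a dict with keys such as 'path' and 'length'.
for entry in client.ls(['/tmp']):
    print(entry['path'], entry['length'])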
def __init__(self, filepath, hdfs_conn_id='hdfs_default', *args, **kwargs):
    super(HdfsSensor, self).__init__(*args, **kwargs)
    self.filepath = filepath
    session = settings.Session()
    db = session.query(DB).filter(DB.conn_id == hdfs_conn_id).first()
    if not db:
        raise Exception("conn_id doesn't exist in the repository")
    self.host = db.host
    self.port = db.port
    NAMENODES = [Namenode(self.host, self.port)]
    self.sb = HAClient(NAMENODES)
    session.commit()
    session.close()
def get_conn(self) -> Any:
    """
    Returns a snakebite HDFSClient object.
    """
    # When using HAClient, proxy_user must be the same, so it is ok to
    # always take the first.
    effective_user = self.proxy_user
    autoconfig = self.autoconfig
    use_sasl = conf.get('core', 'security') == 'kerberos'

    try:
        connections = self.get_connections(self.hdfs_conn_id)

        if not effective_user:
            effective_user = connections[0].login
        if not autoconfig:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        hdfs_namenode_principal = connections[0].extra_dejson.get(
            'hdfs_namenode_principal')
    except AirflowException:
        if not autoconfig:
            raise

    if autoconfig:
        # will read config info from $HADOOP_HOME conf files
        client = AutoConfigClient(effective_user=effective_user,
                                  use_sasl=use_sasl)
    elif len(connections) == 1:
        client = Client(
            connections[0].host,
            connections[0].port,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    elif len(connections) > 1:
        name_node = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(
            name_node,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository "
                                "and autoconfig is not specified")
    return client
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    connections = self.get_connections(self.hdfs_conn_id)
    client = None
    if len(connections) == 1:
        client = Client(connections[0].host, connections[0].port)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def __init__(self, namenode, path, use_trash=False, effective_user=None,
             use_sasl=True, hdfs_namenode_principal='hdfs',
             use_datanode_hostname=False):
    from snakebite.client import HAClient
    from snakebite.namenode import Namenode

    self.path = path
    namenodes = [Namenode(namenode)]
    self._client = HAClient(
        namenodes,
        use_trash=use_trash,
        effective_user=effective_user,
        use_sasl=use_sasl,
        hdfs_namenode_principal=hdfs_namenode_principal,
        use_datanode_hostname=use_datanode_hostname)
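Only the `__init__` is shown, so the class name below is a placeholder (`HdfsPath` is hypothetical); the sketch shows how such a wrapper might be constructed, with hostname, path, and user as illustrative values:

# HdfsPath stands in for whatever class owns the __init__ above.
target = HdfsPath('nn1.example.com', '/data/output',
                  use_trash=True, effective_user='etl')
print(target.path)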
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    use_sasl = False
    if conf.get('core', 'security') == 'kerberos':
        use_sasl = True

    connections = self.get_connections(self.hdfs_conn_id)
    client = None
    if len(connections) == 1:
        client = Client(connections[0].host, connections[0].port,
                        use_sasl=use_sasl)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, use_sasl=use_sasl)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def __create_hdfs_client__():
    try:
        namenode_conf = os.path.dirname(
            os.path.abspath(__file__)) + '/../conf/namenode.conf'
        config_dict = config_parse.config_parse(namenode_conf)
        if 'namenode' not in config_dict or 'host' not in config_dict['namenode'] or \
                'port' not in config_dict['namenode'] or 'second_namenode' not in config_dict or \
                'host' not in config_dict['second_namenode'] or 'port' not in config_dict['second_namenode']:
            logger.error('namenode config file:[%s] invalid' % namenode_conf)
            sys.exit(2)
        namenode_host = config_dict['namenode']['host']
        namenode_port = int(config_dict['namenode']['port'])
        second_namenode_host = config_dict['second_namenode']['host']
        second_namenode_port = int(config_dict['second_namenode']['port'])
        namenode = Namenode(namenode_host, namenode_port)
        second_namenode = Namenode(second_namenode_host, second_namenode_port)
        return HAClient([namenode, second_namenode], use_trash=True)
    except Exception as e:
        logger.error('create hdfs client exception:[%s]' % str(e))
        sys.exit(2)
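`config_parse` is project-specific, so the exact file format is unknown; based on the key lookups above, the parsed `config_dict` plausibly has the following shape (hosts and ports are placeholders):

# Hypothetical result of config_parse(namenode_conf); this mirrors only
# the keys the code above actually reads.
config_dict = {
    'namenode': {'host': 'nn1.example.com', 'port': '8020'},
    'second_namenode': {'host': 'nn2.example.com', 'port': '8020'},
}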
def ha_test():
    n1 = Namenode("192.168.24.137", 9990)
    n2 = Namenode("192.168.24.138", 9990)
    client = HAClient([n1, n2])
    for x in client.ls(['/']):
        print(x)
import time

from snakebite.client import HAClient
from snakebite.namenode import Namenode

n1 = Namenode("namenode-1", 8022)
n2 = Namenode("namenode-2", 8022)

# Timestamp of now.
now = time.time()
# Timestamp of 30 days ago.
thirty_days_ago = now - 30 * 24 * 60 * 60
# Timestamp of 30 days ago in milliseconds.
millis_new = int(round(thirty_days_ago * 1000))
# print(millis_new)

# Connect to the HA client of the HDFS namenodes.
client = HAClient([n1, n2], use_trash=True, sock_connect_timeout=50000,
                  sock_request_timeout=50000)

# Delete Spark application-history files not accessed in the last 30 days.
for file in client.ls(["/user/spark/applicationHistory/"]):
    file_timestamp = file['access_time']
    file_path = file['path']
    print(file_path)
    if file_timestamp < millis_new:
        for p in client.delete([file_path], recurse=True):
            print(p)