def test_invalidate(self):
    """Test ImpalaDbms.invalidate() behavior around metastore availability and table-diff limits.

    Covers: failure when no hive/metastore interpreter is configured, failure
    when the table diff exceeds the 10-table limit, single-table invalidation,
    and full invalidation.
    """
    with patch('impala.dbms.ImpalaDbms._get_different_tables' ) as get_different_tables:
        with patch( 'desktop.models.ClusterConfig.get_hive_metastore_interpreters' ) as get_hive_metastore_interpreters:
            ddms = ImpalaDbms(Mock(query_server={'server_name': ''}), None)
            get_different_tables.return_value = ['customers']

            # Without any hive/metastore interpreter, diff-based invalidation cannot run.
            get_hive_metastore_interpreters.return_value = []
            assert_raises(PopupException, ddms.invalidate, 'default') # No hive/metastore configured

            get_hive_metastore_interpreters.return_value = ['hive']
            ddms.invalidate('default')
            # NOTE(review): this assertion is tautological w.r.t. arguments (it passes the
            # recorded call_args back in); its effective check is the call count of 1.
            ddms.client.query.assert_called_once_with( ddms.client.query.call_args[0][0])
            assert_true('customers' in ddms.client.query.call_args[0] [0].hql_query) # diff of 1 table

            get_different_tables.return_value = [ 'customers', '', '', '', '', '', '', '', '', '', '' ]
            assert_raises(PopupException, ddms.invalidate, 'default') # diff of 11 tables. Limit is 10.

            # Explicit table invalidation issues a second query naming the table.
            ddms.invalidate('default', 'customers')
            assert_true(ddms.client.query.call_count == 2) # Second call
            assert_true('customers' in ddms.client.query.call_args[0] [0].hql_query) # invalidate 1 table

            # No arguments -> full invalidate; the statement names no table.
            ddms.invalidate()
            assert_true(ddms.client.query.call_count == 3) # Third call
            assert_true('customers' not in ddms.client.query.call_args[0] [0].hql_query) # Full invalidate
def get(user, query_server=None, cluster=None):
    """Return a cached Dbms instance for `user` and `query_server`, creating it on first use.

    The cache is keyed by username, then by the query server's `server_name`.
    Access is serialized with DBMS_CACHE_LOCK.
    """
    global DBMS_CACHE
    global DBMS_CACHE_LOCK

    if query_server is None:
        query_server = get_query_server_config(cluster=cluster)

    with DBMS_CACHE_LOCK:
        user_cache = DBMS_CACHE.setdefault(user.username, {})
        server_name = query_server['server_name']

        if server_name not in user_cache:
            # Avoid circular dependency
            from beeswax.server.hive_server2_lib import HiveServerClientCompatible

            if server_name == 'impala':
                from impala.dbms import ImpalaDbms
                from impala.server import ImpalaServerClient
                user_cache[server_name] = ImpalaDbms(
                    HiveServerClientCompatible(ImpalaServerClient(query_server, user)),
                    QueryHistory.SERVER_TYPE[1][0])
            else:
                from beeswax.server.hive_server2_lib import HiveServerClient
                user_cache[server_name] = HiveServer2Dbms(
                    HiveServerClientCompatible(HiveServerClient(query_server, user)),
                    QueryHistory.SERVER_TYPE[1][0])

        return user_cache[server_name]
def get_sample(self, database, table, column=None, nested=None, limit=100, generate_sql_only=False):
    """Return sample rows (or the sample SQL) for a table.

    :param database: database name.
    :param table: table object; `partition_keys` and `name` are read.
    :param column: optional column name to sample (backtick-quoted in the SQL).
    :param nested: optional nested path (Impala only).
    :param limit: row limit for the generated statement.
    :param generate_sql_only: when True, return the HQL string instead of executing it.
    :returns: the HQL string, a fetched result, or None.
    """
    result = None
    hql = None

    # BUG FIX: remember whether the caller actually requested a column/nested path
    # BEFORE defaulting `column` to '*'. Previously `column` was always truthy after
    # the default, which made the plain "SELECT *" else-branch below unreachable.
    explicit_column = bool(column or nested)
    # Filter on max # of partitions for partitioned tables
    column = '`%s`' % column if column else '*'

    if table.partition_keys:
        hql = self._get_sample_partition_query(database, table, column, limit)
    elif self.server_name == 'impala':
        if explicit_column:
            from impala.dbms import ImpalaDbms
            select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
            hql = 'SELECT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
        else:
            hql = "SELECT * FROM `%s`.`%s` LIMIT %s;" % (database, table.name, limit)
    else:
        hql = "SELECT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)  # TODO: Add nested select support for HS2

    if hql:
        if generate_sql_only:
            return hql
        else:
            query = hql_query(hql)
            handle = self.execute_and_wait(query, timeout_sec=5.0)
            if handle:
                result = self.fetch(handle, rows=100)
                self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None, limit=100):
    """Fetch up to 100 sample rows for a table (optionally a single column / nested path).

    :param database: database name.
    :param table: table object; `partition_keys` and `name` are read.
    :param column: optional column name (backtick-quoted in the SQL).
    :param nested: optional nested path (Impala only).
    :param limit: row limit for the generated statement.
    :returns: fetched result or None when nothing was executed.
    """
    result = None
    hql = None

    # BUG FIX: capture whether a column/nested path was actually requested BEFORE
    # defaulting `column` to '*'; the default made `column` always truthy, so the
    # plain "SELECT *" else-branch below could never run.
    explicit_column = bool(column or nested)
    # Filter on max # of partitions for partitioned tables
    column = '`%s`' % column if column else '*'

    if table.partition_keys:
        hql = self._get_sample_partition_query(database, table, column, limit)
    elif self.server_name == 'impala':
        if explicit_column:
            from impala.dbms import ImpalaDbms
            select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
            hql = 'SELECT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
        else:
            hql = "SELECT * FROM `%s`.`%s` LIMIT %s;" % (database, table.name, limit)
    else:
        hql = "SELECT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)  # TODO: Add nested select support for HS2

    if hql:
        query = hql_query(hql)
        handle = self.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None):
    """Fetch sample rows for a table; views are never sampled.

    Column/nested sampling is generated only for Impala; partitioned tables go
    through the partition-limited sample query. Returns the fetched result, or
    None when no statement was executed.
    """
    if table.is_view:
        # Views produce no sample statement.
        return None

    row_limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())
    sample_sql = None

    if column or nested:
        # Could do column for any type, then nested with partitions
        if self.server_name == 'impala':
            select_clause, from_clause = ImpalaDbms.get_nested_select(
                database, table.name, column, nested)
            sample_sql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, row_limit)
    elif table.partition_keys:
        # Filter on max # of partitions for partitioned tables
        sample_sql = self._get_sample_partition_query(database, table, row_limit)
    else:
        sample_sql = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, row_limit)

    result = None
    if sample_sql:
        handle = self.execute_and_wait(hql_query(sample_sql), timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None):
    """Fetch up to 100 sample rows for `database`.`table`.

    Column/nested sampling is only generated for Impala; partition-limited
    sampling is only generated for non-Impala (Hive) servers. Returns the
    fetched result, or None when no statement was executed.
    """
    limit = 100

    def _build_statement():
        # Could do column for any type, then nested with partitions
        if column or nested:
            if self.server_name != 'impala':
                return None
            from impala.dbms import ImpalaDbms
            select_clause, from_clause = ImpalaDbms.get_nested_select(
                database, table.name, column, nested)
            return 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
        # Filter on max # of partitions for partitioned tables
        # Impala's SHOW PARTITIONS is different from Hive, so we only support Hive for now
        if self.server_name != 'impala' and table.partition_keys:
            return self._get_sample_partition_query(database, table, limit)
        return "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, limit)

    hql = _build_statement()
    result = None
    if hql:
        query = hql_query(hql)
        handle = self.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None):
    """Return sample rows for a non-view table, or None.

    Builds one of three statements: an Impala nested/column select, a
    partition-limited sample, or a plain `SELECT *`, then executes it and
    fetches up to 100 rows.
    """
    result = None
    hql = None

    if not table.is_view:
        max_rows = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())
        wants_column = bool(column) or bool(nested)

        if wants_column and self.server_name == 'impala':
            # Could do column for any type, then nested with partitions
            select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
            hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, max_rows)
        elif not wants_column and table.partition_keys:
            # Filter on max # of partitions for partitioned tables
            hql = self._get_sample_partition_query(database, table, max_rows)
        elif not wants_column:
            hql = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, max_rows)
        # else: column/nested requested on non-Impala -> no statement (unsupported).

    if hql:
        handle = self.execute_and_wait(hql_query(hql), timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)

    return result
def get_sample(self, database, table, column=None, nested=None):
    """Fetch up to 100 sample rows for a table.

    Plain/partitioned sampling is handled first; column/nested sampling is
    generated only when the server is Impala. Returns the fetched result, or
    None when nothing was executed.
    """
    sample_limit = 100
    statement = None

    if not (column or nested):
        # Filter on max # of partitions for partitioned tables
        # Impala's SHOW PARTITIONS is different from Hive, so we only support Hive for now
        if table.partition_keys and self.server_name != 'impala':
            statement = self._get_sample_partition_query(database, table, sample_limit)
        else:
            statement = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, sample_limit)
    elif self.server_name == 'impala':
        # Could do column for any type, then nested with partitions
        from impala.dbms import ImpalaDbms
        select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
        statement = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, sample_limit)

    result = None
    if statement:
        handle = self.execute_and_wait(hql_query(statement), timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None):
    """Return sample rows for a non-view table, or None.

    For partitioned tables the sample is restricted to the most recent
    partition via a WHERE clause built from the partition key values.

    BUG FIX: previously `partitions[0]` was indexed unconditionally, raising
    IndexError when a partitioned table had no partitions yet; now an empty
    partition list falls back to an unfiltered sample.
    """
    result = None
    hql = None

    if not table.is_view:
        limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

        if column or nested:
            # Could do column for any type, then nested with partitions
            if self.server_name == 'impala':
                select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
                hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
        else:
            partition_query = ""
            if table.partition_keys:
                partitions = self.get_partitions(database, table, partition_spec=None, max_parts=1)
                if partitions:  # guard: a partitioned table may have zero partitions
                    partition_query = 'WHERE ' + ' AND '.join(
                        ["%s='%s'" % (table.partition_keys[idx].name, key)
                         for idx, key in enumerate(partitions[0].values)])
            hql = "SELECT * FROM `%s`.`%s` %s LIMIT %s" % (database, table.name, partition_query, limit)

    if hql:
        query = hql_query(hql)
        handle = self.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result
def test_get_impala_nested_select(self):
    """Verify ImpalaDbms.get_nested_select for plain, struct, map and array paths."""
    cases = [
        (('default', 'customers', 'id', None),
         ('id', '`default`.`customers`')),
        (('default', 'customers', 'email_preferences', 'categories/promos/'),
         ('email_preferences.categories.promos', '`default`.`customers`')),
        (('default', 'customers', 'addresses', 'key'),
         ('key', '`default`.`customers`.`addresses`')),
        (('default', 'customers', 'addresses', 'value/street_1/'),
         ('street_1', '`default`.`customers`.`addresses`')),
        (('default', 'customers', 'orders', 'item/order_date'),
         ('order_date', '`default`.`customers`.`orders`')),
        (('default', 'customers', 'orders', 'item/items/item/product_id'),
         ('product_id', '`default`.`customers`.`orders`.`items`')),
    ]
    for args, expected in cases:
        assert_equal(ImpalaDbms.get_nested_select(*args), expected)
def get(user, query_server=None, cluster=None):
    """Return a cached Dbms instance for `user`, creating or rebuilding it as needed.

    The cache is keyed by `user.id`, then by the query server's `server_name`.
    A client is built on first use (Impala when dialect == 'impala', a metastore
    client when server_name == 'hms', otherwise HiveServer2). When the entry
    already exists and the module-level RESET_HS2_QUERY_SERVER flag is set, the
    user's cache is cleared and the HS2 client is rebuilt once, consuming the flag.
    All access is serialized with DBMS_CACHE_LOCK.
    """
    global DBMS_CACHE
    global DBMS_CACHE_LOCK
    global RESET_HS2_QUERY_SERVER

    if query_server is None:
        query_server = get_query_server_config(connector=cluster)

    DBMS_CACHE_LOCK.acquire()
    try:
        DBMS_CACHE.setdefault(user.id, {})

        if query_server['server_name'] not in DBMS_CACHE[user.id]:
            # Avoid circular dependency
            from beeswax.server.hive_server2_lib import HiveServerClientCompatible

            if query_server.get('dialect') == 'impala':
                from impala.dbms import ImpalaDbms
                from impala.server import ImpalaServerClient
                DBMS_CACHE[user.id][query_server['server_name']] = ImpalaDbms(
                    HiveServerClientCompatible(ImpalaServerClient(query_server, user)),
                    QueryHistory.SERVER_TYPE[1][0]
                )
            elif query_server['server_name'] == 'hms':
                # Direct Hive Metastore access (no HiveServerClientCompatible wrapper).
                from beeswax.server.hive_metastore_server import HiveMetastoreClient
                DBMS_CACHE[user.id][query_server['server_name']] = HiveServer2Dbms(
                    HiveMetastoreClient(query_server, user),
                    QueryHistory.SERVER_TYPE[1][0]
                )
            else:
                from beeswax.server.hive_server2_lib import HiveServerClient
                DBMS_CACHE[user.id][query_server['server_name']] = HiveServer2Dbms(
                    HiveServerClientCompatible(HiveServerClient(query_server, user)),
                    QueryHistory.SERVER_TYPE[1][0]
                )
        elif RESET_HS2_QUERY_SERVER:
            # A cached entry exists but a reset was requested: drop this user's
            # cache and rebuild the HS2 client once, clearing the global flag.
            from beeswax.server.hive_server2_lib import HiveServerClient, HiveServerClientCompatible
            RESET_HS2_QUERY_SERVER = False
            LOG.debug('Setting DBMS cache for the new hs2')
            DBMS_CACHE[user.id].clear()
            DBMS_CACHE[user.id][query_server['server_name']] = HiveServer2Dbms(
                HiveServerClientCompatible(HiveServerClient(query_server, user)),
                QueryHistory.SERVER_TYPE[1][0]
            )

        return DBMS_CACHE[user.id][query_server['server_name']]
    finally:
        DBMS_CACHE_LOCK.release()
def test_get_impala_nested_select(self):
    """Verify ImpalaDbms.get_nested_select across column/nested-path combinations."""
    select = ImpalaDbms.get_nested_select

    # Plain column, no nested path.
    assert_equal(select('default', 'customers', 'id', None),
                 ('id', '`default`.`customers`'))
    # Nested path with trailing slash is flattened to dotted notation.
    assert_equal(select('default', 'customers', 'email_preferences', 'categories/promos/'),
                 ('email_preferences.categories.promos', '`default`.`customers`'))
    # 'key' and 'value/...' paths select from the column itself.
    assert_equal(select('default', 'customers', 'addresses', 'key'),
                 ('key', '`default`.`customers`.`addresses`'))
    assert_equal(select('default', 'customers', 'addresses', 'value/street_1/'),
                 ('street_1', '`default`.`customers`.`addresses`'))
    # 'item/...' paths select the element field.
    assert_equal(select('default', 'customers', 'orders', 'item/order_date'),
                 ('order_date', '`default`.`customers`.`orders`'))
    assert_equal(select('default', 'customers', 'orders', 'item/items/item/product_id'),
                 ('product_id', '`default`.`customers`.`orders`.`items`'))
def get_sample(self, database, table, column=None, nested=None, limit=100, generate_sql_only=False, operation=None):
    """Return sample rows (or the sample SQL) for a table, optionally aggregated.

    :param database: database name.
    :param table: table object; `partition_keys` and `name` are read.
    :param column: optional column name (backtick-quoted in the SQL).
    :param nested: optional nested path (Impala only).
    :param limit: row limit for non-aggregate statements.
    :param generate_sql_only: when True, return the HQL string instead of executing.
    :param operation: None, 'distinct', 'max' or 'min'.
    :returns: the HQL string, a fetched result, or None.

    BUG FIX: the non-Impala branch used `if/if/if/else` instead of an `elif`
    chain, so 'distinct' and 'max' statements were immediately overwritten by
    the trailing plain-SELECT else. Also, `column` was defaulted to '*' before
    the `column or nested` check, which made the Impala plain "SELECT *"
    else-branch unreachable; the original request is now captured first.
    """
    result = None
    hql = None

    explicit_column = bool(column or nested)
    # Filter on max # of partitions for partitioned tables
    column = '`%s`' % column if column else '*'

    if table.partition_keys:
        hql = self._get_sample_partition_query(database, table, column, limit, operation)
    elif self.server_name.startswith('impala'):
        if explicit_column:
            from impala.dbms import ImpalaDbms
            select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
            if operation == 'distinct':
                hql = 'SELECT DISTINCT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
            elif operation == 'max':
                hql = 'SELECT max(%s) FROM %s;' % (select_clause, from_clause)
            elif operation == 'min':
                hql = 'SELECT min(%s) FROM %s;' % (select_clause, from_clause)
            else:
                hql = 'SELECT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
        else:
            hql = "SELECT * FROM `%s`.`%s` LIMIT %s;" % (database, table.name, limit)
    else:
        if operation == 'distinct':
            hql = "SELECT DISTINCT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)
        elif operation == 'max':
            hql = "SELECT max(%s) FROM `%s`.`%s`;" % (column, database, table.name)
        elif operation == 'min':
            hql = "SELECT min(%s) FROM `%s`.`%s`;" % (column, database, table.name)
        else:
            hql = "SELECT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)
        # TODO: Add nested select support for HS2

    if hql:
        if generate_sql_only:
            return hql
        else:
            query = hql_query(hql)
            handle = self.execute_and_wait(query, timeout_sec=5.0)
            if handle:
                result = self.fetch(handle, rows=100)
                self.close(handle)
    return result
def get_sample(self, database, table, column=None, nested=None):
    """Return sample rows for a non-view table, or None.

    Partitioned tables are sampled from the most recent partition via a WHERE
    clause built from the partition key values.

    BUG FIX: `partitions[0]` was previously indexed without checking the list,
    raising IndexError for a partitioned table with no partitions; an empty
    list now falls back to an unfiltered sample.
    """
    result = None
    hql = None

    if not table.is_view:
        limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

        if column or nested:
            # Could do column for any type, then nested with partitions
            if self.server_name == 'impala':
                select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
                hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
        else:
            partition_query = ""
            if table.partition_keys:
                partitions = self.get_partitions(database, table, partition_spec=None, max_parts=1)
                if partitions:  # guard: a partitioned table may have zero partitions
                    partition_query = 'WHERE ' + ' AND '.join([
                        "%s='%s'" % (table.partition_keys[idx].name, key)
                        for idx, key in enumerate(partitions[0].values)
                    ])
            hql = "SELECT * FROM `%s`.`%s` %s LIMIT %s" % (database, table.name, partition_query, limit)

    if hql:
        query = hql_query(hql)
        handle = self.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            result = self.fetch(handle, rows=100)
            self.close(handle)
    return result