def create_table_as_a_select(self, request, query_history, target_database, target_table, result_meta):
  design = query_history.design.get_design()
  database = design.query["database"]

  # Case 1: Hive Server 2 backend or results straight from an existing table
  if result_meta.in_tablename:
    self.use(database)
    query = self._get_and_validate_select_query(design, query_history)
    hql = "CREATE TABLE %s.%s AS %s" % (target_database, target_table, query)
    query_history = self.execute_statement(hql)
  else:
    # Case 2: The results are in some temporary location
    # Beeswax backward compatibility and optimization

    # 1. Create table
    cols = ""
    schema = result_meta.schema
    for i, field in enumerate(schema.fieldSchemas):
      if i != 0:
        cols += ",\n"
      cols += "`%s` %s" % (field.name, field.type)

    # The representation of the delimiter is messy.
    # It came from Java as a string, which might have been converted from an integer.
    # So it could be "1" (^A), "10" (\n), or "," (a literal comma).
    delim = result_meta.delim
    if not delim.isdigit():
      delim = str(ord(delim))

    hql = """
      CREATE TABLE `%s` (
        %s
      )
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY '\%s'
      STORED AS TextFile
    """ % (target_table, cols, delim.zfill(3))

    query = hql_query(hql)
    self.execute_and_wait(query)

    try:
      # 2. Move the results into the table's storage
      table_obj = self.get_table("default", target_table)
      table_loc = request.fs.urlsplit(table_obj.path_location)[2]
      result_dir = request.fs.urlsplit(result_meta.table_dir)[2]
      request.fs.rename_star(result_dir, table_loc)
      LOG.debug("Moved results from %s to %s" % (result_meta.table_dir, table_loc))
      request.info(request, _("Saved query results as new table %(table)s.") % {"table": target_table})
      query_history.save_state(QueryHistory.STATE.expired)
    except Exception, ex:
      query = hql_query("DROP TABLE `%s`" % target_table)
      try:
        self.execute_and_wait(query)
      except Exception, double_trouble:
        LOG.exception('Failed to drop table "%s" as well: %s' % (target_table, double_trouble))
      raise ex

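# A minimal, self-contained sketch (not part of Hue) of the delimiter normalization used in
# create_table_as_a_select() above, inferred from its comments: the delimiter arrives as a
# string that is either a decimal character code ("1" for ^A, "10" for \n) or a literal
# character (","), and is zero-padded so it can be embedded as a '\NNN' escape in the DDL.
def _normalize_result_delim(delim):
  if not delim.isdigit():
    delim = str(ord(delim))  # e.g. "," -> "44"
  return delim.zfill(3)      # e.g. "1" -> "001", rendered as FIELDS TERMINATED BY '\001'

assert _normalize_result_delim('1') == '001'
assert _normalize_result_delim(',') == '044'
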
def init_example(request):
  """
  Allow developers to run a quick test, checking that inserting and querying data works correctly.
  """
  result = {'status': -1, 'data': {}}

  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)

  # Deleting the table
  hql = "DROP TABLE IF EXISTS val_test_2;"
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)

  # Creating the table
  hql = "CREATE TABLE val_test_2 (id int, token string);"
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)

  # Adding some data
  hql = "INSERT OVERWRITE val_test_2 values (1, 'a'), (2, 'b'), (-1,'xyzzy');"
  # hql = "INSERT INTO TABLE testset_bis VALUES (2, 25.0)"
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)

  # Querying the data
  hql = "SELECT * FROM val_test_2"
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)
  if handle:
    data = db.fetch(handle, rows=100)
    result['data'] = list(data.rows())
    db.close(handle)

  return render('database.initialize.mako', request, locals())

def _save_results_ctas(request, query_history, target_table, result_meta):
  """
  Handle saving results as a new table. Returns HTTP response.
  May raise BeeswaxException, IOError.
  """
  query_server = query_history.get_query_server()  # Query server requires DDL support
  db = dbms.get(request.user)

  # Case 1: The results are straight from an existing table
  if result_meta.in_tablename:
    hql = 'CREATE TABLE `%s` AS SELECT * FROM %s' % (target_table, result_meta.in_tablename)
    query = hql_query(hql)
    # Display the CTAS running. Could take a long time.
    return execute_directly(request, query, query_server, on_success_url=urlresolvers.reverse(show_tables))

  # Case 2: The results are in some temporary location

  # 1. Create table
  cols = ''
  schema = result_meta.schema
  for i, field in enumerate(schema.fieldSchemas):
    if i != 0:
      cols += ',\n'
    cols += '`%s` %s' % (field.name, field.type)

  # The representation of the delimiter is messy.
  # It came from Java as a string, which might have been converted from an integer.
  # So it could be "1" (^A), "10" (\n), or "," (a literal comma).
  delim = result_meta.delim
  if not delim.isdigit():
    delim = str(ord(delim))

  hql = '''
    CREATE TABLE `%s` (
      %s
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\%s'
    STORED AS TextFile
  ''' % (target_table, cols, delim.zfill(3))

  query = hql_query(hql)
  db.execute_and_wait(query)

  try:
    # 2. Move the results into the table's storage
    table_obj = db.get_table('default', target_table)
    table_loc = request.fs.urlsplit(table_obj.path_location)[2]
    request.fs.rename_star(result_meta.table_dir, table_loc)
    LOG.debug("Moved results from %s to %s" % (result_meta.table_dir, table_loc))
    messages.info(request, _('Saved query results as new table %(table)s') % {'table': target_table})
    query_history.save_state(models.QueryHistory.STATE.expired)
  except Exception, ex:
    LOG.error('Error moving data into storage of table %s. Will drop table.' % (target_table,))
    query = hql_query('DROP TABLE `%s`' % (target_table,))
    try:
      db.execute_directly(query)  # Don't wait for results
    except Exception, double_trouble:
      LOG.exception('Failed to drop table "%s" as well: %s' % (target_table, double_trouble))
    raise ex

def create_table_as_a_select(self, request, query_history, target_table, result_meta):
  design = query_history.design.get_design()
  database = design.query['database']

  # Case 1: Hive Server 2 backend or results straight from an existing table
  if result_meta.in_tablename:
    hql = 'CREATE TABLE `%s.%s` AS %s' % (database, target_table, design.query['query'])
    # query = hql_query(hql, database=database)
    query_history = self.execute_statement(hql)
    url = redirect(reverse('beeswax:watch_query', args=[query_history.id]) + '?on_success_url=' + reverse('metastore:describe_table', args=[database, target_table]))
  else:
    # Case 2: The results are in some temporary location
    # Beeswax backward compatibility and optimization

    # 1. Create table
    cols = ''
    schema = result_meta.schema
    for i, field in enumerate(schema.fieldSchemas):
      if i != 0:
        cols += ',\n'
      cols += '`%s` %s' % (field.name, field.type)

    # The representation of the delimiter is messy.
    # It came from Java as a string, which might have been converted from an integer.
    # So it could be "1" (^A), "10" (\n), or "," (a literal comma).
    delim = result_meta.delim
    if not delim.isdigit():
      delim = str(ord(delim))

    hql = '''
      CREATE TABLE `%s` (
        %s
      )
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY '\%s'
      STORED AS TextFile
    ''' % (target_table, cols, delim.zfill(3))

    query = hql_query(hql)
    self.execute_and_wait(query)

    try:
      # 2. Move the results into the table's storage
      table_obj = self.get_table('default', target_table)
      table_loc = request.fs.urlsplit(table_obj.path_location)[2]
      result_dir = request.fs.urlsplit(result_meta.table_dir)[2]
      request.fs.rename_star(result_dir, table_loc)
      LOG.debug("Moved results from %s to %s" % (result_meta.table_dir, table_loc))
      request.info(request, _('Saved query results as new table %(table)s.') % {'table': target_table})
      query_history.save_state(QueryHistory.STATE.expired)
    except Exception, ex:
      query = hql_query('DROP TABLE `%s`' % target_table)
      try:
        self.execute_and_wait(query)
      except Exception, double_trouble:
        LOG.exception('Failed to drop table "%s" as well: %s' % (target_table, double_trouble))
      raise ex

def config_validator(user):
  # dbms is dependent on beeswax.conf (this file)
  # import in method to avoid circular dependency
  from beeswax.design import hql_query
  from beeswax.server import dbms
  from beeswax.server.dbms import get_query_server_config

  res = []

  try:
    try:
      if not 'test' in sys.argv:  # Avoid tests hanging
        query_server = get_query_server_config(name='impala')
        server = dbms.get(user, query_server)
        query = hql_query("SELECT 'Hello World!';")
        handle = server.execute_and_wait(query, timeout_sec=10.0)

        if handle:
          server.fetch(handle, rows=100)
          server.close(handle)
    except StructuredThriftTransportException, ex:
      if 'TSocket read 0 bytes' in str(ex):  # this message appears when authentication fails
        msg = "Failed to authenticate to Impalad, check authentication configurations."
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))
      else:
        raise ex
  except Exception, ex:
    msg = "No available Impalad to send queries to."
    LOG.exception(msg)
    res.append((NICE_NAME, _(msg)))

  return res

def invalidate(self, database=None, flush_all=False):
  handle = None

  try:
    if flush_all or database is None:
      hql = "INVALIDATE METADATA"
      query = hql_query(hql, query_type=QUERY_TYPES[1])
      handle = self.execute_and_wait(query, timeout_sec=10.0)
    else:
      diff_tables = self._get_different_tables(database)
      for table in diff_tables:
        hql = "INVALIDATE METADATA `%s`.`%s`" % (database, table)
        query = hql_query(hql, query_type=QUERY_TYPES[1])
        handle = self.execute_and_wait(query, timeout_sec=10.0)
  except QueryServerTimeoutException, e:
    # Allow timeout exceptions to propagate
    raise e

def test_data_export(self):
  hql = 'SELECT * FROM test'
  query = hql_query(hql)

  # Get the result in xls.
  handle = self.db.execute_and_wait(query)
  xls_resp = download(handle, 'xls', self.db)
  translated_csv = xls2csv(xls_resp.content)
  # It should have 257 lines (256 + header)
  assert_equal(len(translated_csv.strip('\r\n').split('\r\n')), 257)

  # Get the result in csv.
  query = hql_query(hql)
  handle = self.db.execute_and_wait(query)
  csv_resp = download(handle, 'csv', self.db)
  assert_equal(csv_resp.content, translated_csv)

def get_sample(self, database, table, column=None, nested=None):
  result = None
  hql = None

  if not table.is_view:
    limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

    if column or nested:  # Could do column for any type, then nested with partitions
      if self.server_name == 'impala':
        select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
        hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
    else:
      partition_query = ""
      if table.partition_keys:
        partitions = self.get_partitions(database, table, partition_spec=None, max_parts=1)
        partition_query = 'WHERE ' + ' AND '.join(["%s='%s'" % (table.partition_keys[idx].name, key) for idx, key in enumerate(partitions[0].values)])
      hql = "SELECT * FROM `%s`.`%s` %s LIMIT %s" % (database, table.name, partition_query, limit)

    if hql:
      query = hql_query(hql)
      handle = self.execute_and_wait(query, timeout_sec=5.0)
      if handle:
        result = self.fetch(handle, rows=100)
        self.close(handle)

  return result

def create(self, django_user):
  """
  Create table in the Hive Metastore.
  """
  LOG.info('Creating table "%s"' % (self.name,))
  db = dbms.get(django_user, self.query_server)

  try:
    # Already exists?
    if self.app_name == 'impala':
      db.invalidate_tables('default', [self.name])
    db.get_table('default', self.name)
    msg = _('Table "%(table)s" already exists.') % {'table': self.name}
    LOG.error(msg)
    return False
  except Exception:
    query = hql_query(self.hql)
    try:
      results = db.execute_and_wait(query)
      if not results:
        msg = _('Error creating table %(table)s: Operation timeout.') % {'table': self.name}
        LOG.error(msg)
        raise InstallException(msg)
      return True
    except Exception, ex:
      msg = _('Error creating table %(table)s: %(error)s.') % {'table': self.name, 'error': ex}
      LOG.error(msg)
      raise InstallException(msg)

def load_after_create(request, database):
  """
  Automatically load data into a newly created table.

  We get here from the create's on_success_url, and expect to find
  ``table`` and ``path`` from the parameters.
  """
  tablename = request.REQUEST.get("table")
  path = request.REQUEST.get("path")

  if not tablename or not path:
    msg = _("Internal error: Missing needed parameter to load data into table.")
    LOG.error(msg)
    raise PopupException(msg)

  # Default to "off" so a missing "removeHeader" parameter does not raise AttributeError
  is_remove_header = request.REQUEST.get("removeHeader", "off").lower() == "on" and not path.endswith("gz")

  if is_remove_header:
    remove_header(request.fs, path)

  LOG.debug("Auto loading data from %s into table %s" % (path, tablename))
  hql = "LOAD DATA INPATH '%s' INTO TABLE `%s.%s`" % (path, database, tablename)
  query = hql_query(hql)

  on_success_url = reverse("metastore:describe_table", kwargs={"database": database, "table": tablename})

  return execute_directly(request, query, on_success_url=on_success_url)

def alter_column(self, database, table_name, column_name, new_column_name, column_type, comment=None,
                 partition_spec=None, cascade=False):
  hql = 'ALTER TABLE `%s`.`%s`' % (database, table_name)

  if partition_spec:
    hql += ' PARTITION (%s)' % partition_spec

  hql += ' CHANGE COLUMN `%s` `%s` %s' % (column_name, new_column_name, column_type.upper())

  if comment:
    hql += " COMMENT '%s'" % comment

  if cascade:
    hql += ' CASCADE'

  timeout = SERVER_CONN_TIMEOUT.get()
  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=timeout)

  if handle:
    self.close(handle)
  else:
    msg = _("Failed to execute alter column statement: %s") % hql
    raise QueryServerException(msg)

  return self.get_column(database, table_name, new_column_name)

def get_sample(self, database, table, column=None, nested=None, limit=100):
  result = None
  hql = None

  # Filter on max # of partitions for partitioned tables
  column = '`%s`' % column if column else '*'

  if table.partition_keys:
    hql = self._get_sample_partition_query(database, table, column, limit)
  elif self.server_name == 'impala':
    if column or nested:
      from impala.dbms import ImpalaDbms
      select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
      hql = 'SELECT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
    else:
      hql = "SELECT * FROM `%s`.`%s` LIMIT %s;" % (database, table.name, limit)
  else:
    hql = "SELECT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)
    # TODO: Add nested select support for HS2

  if hql:
    query = hql_query(hql)
    handle = self.execute_and_wait(query, timeout_sec=5.0)
    if handle:
      result = self.fetch(handle, rows=100)
      self.close(handle)

  return result

def test_document_create(self):
  sql = 'SELECT * FROM sample_07'

  design = hql_query(sql)

  # is_auto
  # is_trashed
  # is_redacted
  old_query = SavedQuery.objects.create(
    type=SavedQuery.TYPES_MAPPING['hql'],
    owner=self.user,
    data=design.dumps(),
    name='See examples',
    desc='Example of old format'
  )

  try:
    new_query = import_saved_beeswax_query(old_query)
    new_query_data = new_query.get_data()

    assert_equal('query-hive', new_query_data['type'])
    assert_equal('See examples', new_query_data['name'])
    assert_equal('Example of old format', new_query_data['description'])

    assert_equal('ready', new_query_data['snippets'][0]['status'])
    assert_equal('See examples', new_query_data['snippets'][0]['name'])
    assert_equal('SELECT * FROM sample_07', new_query_data['snippets'][0]['statement_raw'])

    assert_equal([], new_query_data['snippets'][0]['properties']['settings'])
    assert_equal([], new_query_data['snippets'][0]['properties']['files'])
    assert_equal([], new_query_data['snippets'][0]['properties']['functions'])
  finally:
    old_query.delete()

def database_initialize(request):
  """
  Install the tables for this application.
  """
  # The variant tables (impala and hbase)
  database_create_variants(request, temporary=False)

  # Connection to the db
  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)

  # The sql queries
  sql = "DROP TABLE IF EXISTS map_sample_id; CREATE TABLE map_sample_id (internal_sample_id STRING, customer_sample_id STRING, date_creation TIMESTAMP, date_modification TIMESTAMP);"
  sql += " DROP TABLE IF EXISTS sample_files; CREATE TABLE sample_files (id STRING, internal_sample_id STRING, file_path STRING, file_type STRING, date_creation TIMESTAMP, date_modification TIMESTAMP) row format delimited fields terminated by ',' stored as textfile;"

  # The clinical db
  sql += "DROP TABLE IF EXISTS clinical_sample; CREATE TABLE clinical_sample (sample_id STRING, patient_id STRING, date_of_collection STRING, original_sample_id STRING, status STRING, sample_type STRING, biological_contamination STRING, storage_condition STRING, biobank_id STRING, pn_id STRING) row format delimited fields terminated by ',' stored as textfile;"

  # Executing the different queries
  tmp = sql.split(";")
  for hql in tmp:
    hql = hql.strip()
    if hql:
      query = hql_query(hql)
      handle = db.execute_and_wait(query, timeout_sec=5.0)

  return render('database.initialize.mako', request, locals())

def drop_database(request):
  db = dbms.get(request.user)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())

      if request.POST.get('is_embeddable'):
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(
            name='Execute and watch',
            editor_type='hive',
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True
        )
        return JsonResponse(job.execute(request))
      else:
        query_history = db.drop_databases(databases, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception, ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {'databases': ','.join(databases), 'error': error_message}
      raise PopupException(error, title=_("Hive Error"), detail=log)

def sample_search(request):
  """
  Search the data related to a given sample id.
  """
  result = {'status': -1, 'data': {}}

  if request.method != 'POST' or not request.POST or not request.POST['sample_id']:
    result['status'] = 0
    return HttpResponse(json.dumps(result), mimetype="application/json")

  sample_id = str(request.POST['sample_id'])

  # Database connection
  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)

  customer_sample_id = str(request.user.id) + "_" + sample_id

  # Selecting the files related to the sample id
  hql = "SELECT sample_files.id, sample_files.file_path FROM sample_files JOIN map_sample_id ON sample_files.internal_sample_id = map_sample_id.internal_sample_id WHERE map_sample_id.customer_sample_id = '" + customer_sample_id + "';"
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)
  if handle:
    data = db.fetch(handle, rows=100)
    result['status'] = 1
    result['data'] = list(data.rows())
    db.close(handle)

  # Returning the data
  return HttpResponse(json.dumps(result), mimetype="application/json")

def execute(self, notebook, snippet):
  db = self._get_db(snippet)

  # Multi-statement query: pick the next statement, unless this is the first one or we reached the last
  statement_id = snippet['result']['handle'].get('statement_id', 0)
  statements_count = snippet['result']['handle'].get('statements_count', 1)

  if snippet['result']['handle'].get('has_more_statements'):
    try:
      handle = self._get_handle(snippet)
      db.close_operation(handle)  # Always close the handle of the previous statement
    except:
      LOG.warn('Could not close previous multiquery query')
    statement_id += 1
  else:
    statement_id = 0

  statements = self._get_statements(snippet['statement'])

  if statements_count != len(statements):
    statement_id = 0

  statement = statements[statement_id]

  settings = snippet['properties'].get('settings', None)
  file_resources = snippet['properties'].get('files', None)
  functions = snippet['properties'].get('functions', None)

  database = snippet.get('database') or 'default'

  query = hql_query(statement, query_type=QUERY_TYPES[0], settings=settings, file_resources=file_resources, functions=functions, database=database)

  try:
    db.use(database)
    handle = db.client.query(query)
  except QueryServerException, ex:
    raise QueryError(ex.message)

def alter_table(self, database, table_name, new_table_name=None, comment=None, tblproperties=None):
  table_obj = self.get_table(database, table_name)
  if table_obj is None:
    raise PopupException(_("Failed to find the table: %s") % table_name)

  if table_obj.is_view:
    hql = 'ALTER VIEW `%s`.`%s`' % (database, table_name)
  else:
    hql = 'ALTER TABLE `%s`.`%s`' % (database, table_name)

  if new_table_name:
    table_name = new_table_name
    hql += ' RENAME TO `%s`' % table_name
  elif comment is not None:
    hql += " SET TBLPROPERTIES ('comment' = '%s')" % comment
  elif tblproperties:
    hql += " SET TBLPROPERTIES (%s)" % ' ,'.join("'%s' = '%s'" % (k, v) for k, v in tblproperties.items())

  timeout = SERVER_CONN_TIMEOUT.get()
  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=timeout)

  if handle:
    self.close(handle)
  else:
    msg = _("Failed to execute alter table statement: %s") % hql
    raise QueryServerException(msg)

  return self.client.get_table(database, table_name)

def get_table_columns_stats(self, database, table, column):
  if self.server_name.startswith('impala'):
    hql = 'SHOW COLUMN STATS `%(database)s`.`%(table)s`' % {'database': database, 'table': table}
  else:
    hql = 'DESCRIBE FORMATTED `%(database)s`.`%(table)s` `%(column)s`' % {'database': database, 'table': table, 'column': column}

  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=5.0)

  if handle:
    result = self.fetch(handle, rows=100)
    self.close(handle)
    data = list(result.rows())

    if self.server_name.startswith('impala'):
      if column == -1:  # All the columns
        return [self._extract_impala_column(col) for col in data]
      else:
        data = [col for col in data if col[0] == column][0]
        return self._extract_impala_column(data)
    else:
      return [
        {'col_name': data[2][0]},
        {'data_type': data[2][1]},
        {'min': data[2][2]},
        {'max': data[2][3]},
        {'num_nulls': data[2][4]},
        {'distinct_count': data[2][5]},
        {'avg_col_len': data[2][6]},
        {'max_col_len': data[2][7]},
        {'num_trues': data[2][8]},
        {'num_falses': data[2][9]}
      ]
  else:
    return []

def config_validator(user):
  # dbms is dependent on beeswax.conf (this file)
  # import in method to avoid circular dependency
  from beeswax.design import hql_query
  from beeswax.server import dbms

  res = []

  try:
    try:
      if not 'test' in sys.argv:  # Avoid tests hanging
        server = dbms.get(user)
        query = hql_query("SELECT 'Hello World!';")
        handle = server.execute_and_wait(query, timeout_sec=10.0)

        if handle:
          server.fetch(handle, rows=100)
          server.close(handle)
    except StructuredThriftTransportException, e:
      if 'Error validating the login' in str(e):
        msg = 'Failed to authenticate to HiveServer2, check authentication configurations.'
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))
      else:
        raise e
  except Exception, e:
    msg = "The application won't work without a running HiveServer2."
    LOG.exception(msg)
    res.append((NICE_NAME, _(msg)))

  return res

def get_sample(self, database, table, column=None, nested=None):
  result = None
  hql = None
  limit = 100

  if column or nested:  # Could do column for any type, then nested with partitions
    if self.server_name == 'impala':
      from impala.dbms import ImpalaDbms
      select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
      hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
  else:
    # Filter on max # of partitions for partitioned tables
    # Impala's SHOW PARTITIONS is different from Hive's, so we only support Hive for now
    if self.server_name != 'impala' and table.partition_keys:
      hql = self._get_sample_partition_query(database, table, limit)
    else:
      hql = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, limit)

  if hql:
    query = hql_query(hql)
    handle = self.execute_and_wait(query, timeout_sec=5.0)
    if handle:
      result = self.fetch(handle, rows=100)
      self.close(handle)

  return result

def _guess_range(user, dashboard, field):
  hql = "SELECT MIN(%(field)s), MAX(%(field)s) FROM %(database)s.%(table)s" % {
    'field': field['name'],
    'database': dashboard['properties'][0]['database'],
    'table': dashboard['properties'][0]['table']
  }

  query_server = get_query_server_config(name='impala')
  db = dbms.get(user, query_server=query_server)

  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=35.0)
  data = db.fetch(handle, rows=1)
  stats_min, stats_max = list(data.rows())[0]
  db.close(handle)

  _min, _m = _round_number_range(stats_min)
  _m, _max = _round_number_range(stats_max)

  properties = {
    'min': stats_min,
    'max': stats_max,
    'start': _min,
    'end': _max,
    #'gap': gap,
    'canRange': True,
    'isDate': False,
  }

  return properties

def _prepare_hql_query(self, snippet, statement, session):
  settings = snippet['properties'].get('settings', None)
  file_resources = snippet['properties'].get('files', None)
  functions = snippet['properties'].get('functions', None)
  properties = session['properties'] if session else []

  # Get properties from the session if not defined in the snippet
  if not settings:
    settings = next((prop['value'] for prop in properties if prop['key'] == 'settings'), None)
  if not file_resources:
    file_resources = next((prop['value'] for prop in properties if prop['key'] == 'files'), None)
  if not functions:
    functions = next((prop['value'] for prop in properties if prop['key'] == 'functions'), None)

  database = snippet.get('database') or 'default'

  return hql_query(
    statement,
    query_type=QUERY_TYPES[0],
    settings=settings,
    file_resources=file_resources,
    functions=functions,
    database=database
  )

def put(self, request, original_variant, pk):
  # Allows modifying a variant in HBase/Impala
  f = open('/tmp/superhello.txt', 'w')
  f.write(json.dumps(request.data))
  f.close()

  # We convert the original and modified data to flat json
  fc = formatConverters(input_file='stuff.json', output_file='stuff.json')
  original_flatjson = fc.convertVariantJsonToFlatJson(json_data=original_variant)
  modified_flatjson = fc.convertVariantJsonToFlatJson(json_data=request.data)

  # We convert the data to hbase, and we modify some fields directly
  # (note: the keys are almost the same for hbase and impala)
  hbase_data = fc.convertVariantFlatJsonToHbase(original_data=original_flatjson, modified_data=modified_flatjson)

  # Impala - We make the query
  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)
  query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
  handle = db.execute_and_wait(query, timeout_sec=5.0)
  if handle:
    db.close(handle)
  else:
    raise Exception("Impossible to create the variant...")

  # HBase - We add the data in that table too
  hbaseApi = HbaseApi(user=request.user)
  currentCluster = hbaseApi.getClusters().pop()
  rowkey = pk
  hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)

def database_initialize(request):
  """
  Install the tables for this application.
  """
  # Connection to the db
  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)

  # The sql queries
  sql = "DROP TABLE IF EXISTS map_sample_id; CREATE TABLE map_sample_id (internal_sample_id STRING, customer_sample_id STRING, date_creation TIMESTAMP, date_modification TIMESTAMP);"
  sql += " DROP TABLE IF EXISTS sample_files; CREATE TABLE sample_files (id STRING, internal_sample_id STRING, file_path STRING, file_type STRING, date_creation TIMESTAMP, date_modification TIMESTAMP);"

  # The clinical db
  sql += "DROP TABLE IF EXISTS clinical_sample; CREATE TABLE clinical_sample (sample_id STRING, patient_id STRING, date_of_collection STRING, original_sample_id STRING, status STRING, sample_type STRING, biological_contamination STRING, storage_condition STRING, biobank_id STRING, pn_id STRING);"

  # DROP TABLE IF EXISTS variants; CREATE TABLE variants (id STRING, alternate_bases STRING, calls STRING, names STRING, info STRING, reference_bases STRING, quality DOUBLE, created TIMESTAMP, elem_start BIGINT, elem_end BIGINT, variantset_id STRING);
  # DROP TABLE IF EXISTS variantsets; CREATE TABLE variantsets (id STRING, dataset_id STRING, metadata STRING, reference_bounds STRING);
  # DROP TABLE IF EXISTS datasets; CREATE TABLE datasets (id STRING, is_public BOOLEAN, name STRING);

  # Executing the different queries
  tmp = sql.split(";")
  for hql in tmp:
    hql = hql.strip()
    if hql:
      query = hql_query(hql)
      handle = db.execute_and_wait(query, timeout_sec=5.0)

  return render('database.initialize.mako', request, locals())

def patient_search(request):
  """
  Search the data related to a given patient id or dossier no.
  """
  result = {'status': -1, 'data': {}}

  if request.method != 'POST' or not request.POST or not request.POST['sample_id']:
    result['status'] = 0
    return HttpResponse(json.dumps(result), mimetype="application/json")

  hql = "SELECT patient.patient_id, patient.dossier_no, patient.patient_family_name, patient.patient_first_name, ethnicity.ethnicity FROM patient JOIN ethnicity ON patient.ethnic_id = ethnicity.ethnic_id WHERE "
  if "patient_id" in request.POST:
    patient_id = str(request.POST['patient_id'])
    hql += "patient.patient_id = '" + patient_id + "';"
  elif "dossier_no" in request.POST:
    dossier_no = str(request.POST['dossier_no'])
    hql += "patient.dossier_no = '" + dossier_no + "';"

  # Database connection
  query_server = get_query_server_config(name='impala')
  db = dbms.get(request.user, query_server=query_server)

  # Selecting the patient data
  query = hql_query(hql)
  handle = db.execute_and_wait(query, timeout_sec=5.0)
  if handle:
    data = db.fetch(handle, rows=100)
    result['status'] = 1
    result['data'] = list(data.rows())
    db.close(handle)

  # Returning the data
  return HttpResponse(json.dumps(result), mimetype="application/json")

def test_import_project(self):
  # Test that when importing a Document that is tagged with a project, we create a directory
  # with that tag name and place the document within it
  sql = 'SELECT * FROM sample_07'

  design = hql_query(sql)

  query = SavedQuery.objects.create(
    type=SavedQuery.TYPES_MAPPING['impala'],
    owner=self.user,
    data=design.dumps(),
    name='Impala query',
    desc='Test Impala query'
  )
  doc = Document.objects.link(query, owner=query.owner, extra=query.type, name=query.name, description=query.desc)

  default_tag = DocumentTag.objects.get_default_tag(self.user)
  custom_tag = DocumentTag.objects.create_tag(self.user, 'sample_07')
  doc.add_tag(default_tag)
  doc.add_tag(custom_tag)

  try:
    converter = DocumentConverter(self.user)
    converter.convert()

    # Should have a directory named after custom tag
    assert_true(Directory.objects.filter(owner=self.user, name=custom_tag.tag, parent_directory=self.home_dir).exists())

    # But ignore reserved tags (default)
    assert_false(Directory.objects.filter(owner=self.user, name=default_tag.tag, parent_directory=self.home_dir).exists())

    # Document should exist under custom directory
    project_dir = Directory.objects.get(owner=self.user, name=custom_tag.tag, parent_directory=self.home_dir)
    assert_true(Document2.objects.filter(owner=self.user, name='Impala query', parent_directory=project_dir).exists())
  finally:
    query.delete()

def test_convert_hive_query_with_invalid_name(self):
  sql = "SELECT * FROM sample_07"

  settings = [
    {"key": "hive.exec.scratchdir", "value": "/tmp/mydir"},
    {"key": "hive.querylog.location", "value": "/tmp/doc2"},
  ]
  file_resources = [{"type": "jar", "path": "/tmp/doc2/test.jar"}]
  functions = [{"name": "myUpper", "class_name": "org.hue.udf.MyUpper"}]

  design = hql_query(sql, database="etl", settings=settings, file_resources=file_resources, functions=functions)

  query = SavedQuery.objects.create(
    type=SavedQuery.TYPES_MAPPING["hql"],
    owner=self.user,
    data=design.dumps(),
    name="Test / Hive query",
    desc="Test Hive query",
  )
  doc = Document.objects.link(query, owner=query.owner, extra=query.type, name=query.name, description=query.desc)

  try:
    # Test that corresponding doc2 is created after convert
    assert_equal(0, Document2.objects.filter(owner=self.user, type="query-hive").count())

    converter = DocumentConverter(self.user)
    converter.convert()

    assert_equal(1, Document2.objects.filter(owner=self.user, type="query-hive").count())
    doc2 = Document2.objects.get(owner=self.user, type="query-hive", is_history=False)

    # Verify Document2 name is stripped of invalid chars
    assert_equal("Test Hive query", doc2.data_dict["name"])
  finally:
    query.delete()

def test_document_create(self):
  sql = "SELECT * FROM sample_07"

  design = hql_query(sql)

  # is_auto
  # is_trashed
  # is_redacted
  old_query = SavedQuery.objects.create(
    type=SavedQuery.TYPES_MAPPING["hql"],
    owner=self.user,
    data=design.dumps(),
    name="See examples",
    desc="Example of old format",
  )

  try:
    new_query = import_saved_beeswax_query(old_query)
    new_query_data = new_query.get_data()

    assert_equal("query-hive", new_query_data["type"])
    assert_equal("See examples", new_query_data["name"])
    assert_equal("Example of old format", new_query_data["description"])

    assert_equal("ready", new_query_data["snippets"][0]["status"])
    assert_equal("See examples", new_query_data["snippets"][0]["name"])
    assert_equal("SELECT * FROM sample_07", new_query_data["snippets"][0]["statement_raw"])

    assert_equal([], new_query_data["snippets"][0]["properties"]["settings"])
    assert_equal([], new_query_data["snippets"][0]["properties"]["files"])
    assert_equal([], new_query_data["snippets"][0]["properties"]["functions"])
  finally:
    old_query.delete()

def get_sample(self, database, table, column=None, nested=None):
  result = None
  hql = None

  if not table.is_view:
    limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

    if column or nested:  # Could do column for any type, then nested with partitions
      if self.server_name == 'impala':
        select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
        hql = 'SELECT %s FROM %s LIMIT %s' % (select_clause, from_clause, limit)
    else:
      if table.partition_keys:  # Filter on max # of partitions for partitioned tables
        hql = self._get_sample_partition_query(database, table, limit)
      else:
        hql = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, limit)

    if hql:
      query = hql_query(hql)
      handle = self.execute_and_wait(query, timeout_sec=5.0)
      if handle:
        result = self.fetch(handle, rows=100)
        self.close(handle)

  return result

def get_table_stats(self, database, table):
  stats = []

  if self.server_name == 'impala':
    hql = 'SHOW TABLE STATS `%(database)s`.`%(table)s`' % {'database': database, 'table': table}

    query = hql_query(hql)
    handle = self.execute_and_wait(query, timeout_sec=5.0)

    if handle:
      result = self.fetch(handle, rows=100)
      self.close(handle)
      stats = list(result.rows())
  else:
    table = self.get_table(database, table)
    stats = table.stats

  return stats

def get_databases(self, database_names='*'):
  identifier = self.to_matching_wildcard(database_names)

  hql = "SHOW DATABASES LIKE '%s'" % identifier  # self.client.get_databases() is too slow
  query = hql_query(hql)
  timeout = SERVER_CONN_TIMEOUT.get()

  handle = self.execute_and_wait(query, timeout_sec=timeout)

  if handle:
    result = self.fetch(handle, rows=5000)
    self.close(handle)
    databases = [name for database in result.rows() for name in database]
    if len(databases) <= APPLY_NATURAL_SORT_MAX.get():
      databases = apply_natural_sort(databases)
    return databases
  else:
    return []

def _submit_create_and_load(request, create_hql, table_name, path, do_load, database):
  """
  Submit the table creation, and set up the load to happen (if ``do_load``).
  """
  on_success_params = {}
  app_name = get_app_name(request)

  if do_load:
    on_success_params['table'] = table_name
    on_success_params['path'] = path
    on_success_url = reverse(app_name + ':load_after_create', kwargs={'database': database})
  else:
    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': table_name})

  query = hql_query(create_hql, database=database)
  return execute_directly(request, query, on_success_url=on_success_url, on_success_params=on_success_params)

def _submit_create_and_load(request, create_hql, table_name, path, load_data, database):
  """
  Submit the table creation, and set up the load to happen (if ``load_data`` == IMPORT).
  """
  on_success_params = QueryDict('', mutable=True)
  app_name = get_app_name(request)

  if load_data == 'IMPORT':
    on_success_params['table'] = table_name
    on_success_params['path'] = path
    on_success_url = reverse(app_name + ':load_after_create', kwargs={'database': database}) + '?' + on_success_params.urlencode()
  else:
    on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': table_name})

  query = hql_query(create_hql, database=database)
  return execute_directly(request, query, on_success_url=on_success_url, on_success_params=on_success_params)

def drop_partition(request, database, table):
  source_type = request.POST.get('source_type', 'hive')
  cluster = json.loads(request.POST.get('cluster', '{}'))

  db = _get_db(user=request.user, source_type=source_type, cluster=cluster)

  if request.method == 'POST':
    partition_specs = request.POST.getlist('partition_selection')
    partition_specs = [spec for spec in partition_specs]

    try:
      if request.GET.get("format", "html") == "json":
        last_executed = json.loads(request.POST.get('start_time', '-1'))

        sql = db.drop_partitions(database, table, partition_specs, design=None, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop partition %s') % ', '.join(partition_specs)[:100],
            editor_type=source_type,
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
        query_history = db.drop_partitions(database, table, partition_specs, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + \
            reverse('metastore:describe_partitions', kwargs={'database': database, 'table': table})
        return redirect(url)
    except Exception, ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(partition)s. Error: %(error)s") % {
        'partition': '\n'.join(partition_specs),
        'error': error_message
      }
      raise PopupException(error, title=_("DB Error"), detail=log)

def _submit_create_and_load(request, create_hql, table_name, path, do_load):
  """
  Submit the table creation, and set up the load to happen (if ``do_load``).
  """
  on_success_params = {}

  if do_load:
    on_success_params['table'] = table_name
    on_success_params['path'] = path
    on_success_url = urlresolvers.reverse(load_after_create)
  else:
    on_success_url = urlresolvers.reverse(describe_table, kwargs={'table': table_name})

  query = hql_query(create_hql)
  return execute_directly(request, query, on_success_url=on_success_url, on_success_params=on_success_params)

def drop_tables(self, database, tables, design):
  hql = []

  for table in tables:
    if table.is_view:
      hql.append("DROP VIEW `%s.%s`" % (database, table.name,))
    else:
      hql.append("DROP TABLE `%s.%s`" % (database, table.name,))

  query = hql_query(';'.join(hql), database)
  design.data = query.dumps()
  design.save()

  return self.execute_query(query, design)

def get_top_terms(self, database, table, column, limit=30, prefix=None):
  limit = min(limit, 100)
  prefix_match = ''
  if prefix:
    prefix_match = "WHERE CAST(%(column)s AS STRING) LIKE '%(prefix)s%%'" % {'column': column, 'prefix': prefix}

  hql = 'SELECT %(column)s, COUNT(*) AS ct FROM `%(database)s`.`%(table)s` %(prefix_match)s GROUP BY %(column)s ORDER BY ct DESC LIMIT %(limit)s' % {
    'database': database,
    'table': table,
    'column': column,
    'prefix_match': prefix_match,
    'limit': limit,
  }

  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=60.0)  # Hive is very slow

  if handle:
    result = self.fetch(handle, rows=limit)
    self.close(handle)
    return list(result.rows())
  else:
    return []

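# For illustration only (hypothetical inputs, not from Hue's tests): with database='default',
# table='web_logs', column='country', prefix='U' and the default limit, get_top_terms() above
# renders HQL like:
#
#   SELECT country, COUNT(*) AS ct FROM `default`.`web_logs`
#   WHERE CAST(country AS STRING) LIKE 'U%'
#   GROUP BY country ORDER BY ct DESC LIMIT 30
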
def get_sample(self, database, table, column=None, nested=None, limit=100, generate_sql_only=False):
  result = None
  hql = None

  # Filter on max # of partitions for partitioned tables
  column = '`%s`' % column if column else '*'

  if table.partition_keys:
    hql = self._get_sample_partition_query(database, table, column, limit)
  elif self.server_name == 'impala':
    if column or nested:
      from impala.dbms import ImpalaDbms
      select_clause, from_clause = ImpalaDbms.get_nested_select(database, table.name, column, nested)
      hql = 'SELECT %s FROM %s LIMIT %s;' % (select_clause, from_clause, limit)
    else:
      hql = "SELECT * FROM `%s`.`%s` LIMIT %s;" % (database, table.name, limit)
  else:
    hql = "SELECT %s FROM `%s`.`%s` LIMIT %s;" % (column, database, table.name, limit)
    # TODO: Add nested select support for HS2

  if hql:
    if generate_sql_only:
      return hql
    else:
      query = hql_query(hql)
      handle = self.execute_and_wait(query, timeout_sec=5.0)
      if handle:
        result = self.fetch(handle, rows=100)
        self.close(handle)

  return result

def test_import_permissions(self):
  make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)

  other_user = User.objects.get(username="******")
  test_group = get_default_user_group()

  # Test that when importing a Document with permissions, the corresponding permissions are created for the Doc2
  sql = 'SELECT * FROM sample_07'

  design = hql_query(sql)

  query = SavedQuery.objects.create(
    type=SavedQuery.TYPES_MAPPING['impala'],
    owner=self.user,
    data=design.dumps(),
    name='Impala query',
    desc='Test Impala query'
  )
  doc = Document.objects.link(query, owner=query.owner, extra=query.type, name=query.name, description=query.desc)

  read_perm = DocumentPermission.objects.create(doc=doc, perms='read')
  read_perm.users.add(other_user)
  read_perm.groups.add(test_group)
  write_perm = DocumentPermission.objects.create(doc=doc, perms='write')
  write_perm.users.add(other_user)

  try:
    converter = DocumentConverter(self.user)
    converter.convert()

    doc2 = Document2.objects.get(owner=self.user, name=query.name)

    # Test that doc2 has same read permissions
    assert_true(other_user in doc2.get_permission('read').users.all())
    assert_true(test_group in doc2.get_permission('read').groups.all())

    # Test that doc2 has same write permissions
    assert_true(other_user in doc2.get_permission('write').users.all())
  finally:
    query.delete()

def drop_database(request):
  db = dbms.get(request.user)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())

      if request.POST.get('is_embeddable'):
        last_executed = json.loads(request.POST.get('start_time', '-1'))
        sql = db.drop_databases(databases, design, generate_ddl_only=True)
        job = make_notebook(
            name=_('Drop database %s') % ', '.join(databases)[:100],
            editor_type='hive',
            statement=sql.strip(),
            status='ready',
            database=None,
            on_success_url='assist.db.refresh',
            is_task=True,
            last_executed=last_executed
        )
        return JsonResponse(job.execute(request))
      else:
        query_history = db.drop_databases(databases, design)
        url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + reverse('metastore:databases')
        return redirect(url)
    except Exception, ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {'databases': ','.join(databases), 'error': error_message}
      raise PopupException(error, title=_("DB Error"), detail=log)

def config_validator(user):
  # dbms is dependent on beeswax.conf (this file)
  # import in method to avoid circular dependency
  from beeswax.design import hql_query
  from beeswax.server import dbms

  res = []

  try:
    try:
      if not 'test' in sys.argv:  # Avoid tests hanging
        server = dbms.get(user)
        query = hql_query("SELECT 'Hello World!';")
        handle = server.execute_and_wait(query, timeout_sec=10.0)

        if handle:
          server.fetch(handle, rows=100)
          server.close(handle)
    except StructuredThriftTransportException as e:
      if 'Error validating the login' in str(e):
        msg = 'Failed to authenticate to HiveServer2, check authentication configurations.'
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))
      else:
        raise e
  except Exception as e:
    msg = "The application won't work without a running HiveServer2."
    LOG.exception(msg)
    res.append((NICE_NAME, _(msg)))

  try:
    from desktop.lib.fsmanager import get_filesystem
    warehouse = beeswax.hive_site.get_metastore_warehouse_dir()
    fs = get_filesystem()
    if fs:
      fs.do_as_superuser(fs.stats, warehouse)
  except Exception:
    msg = 'Failed to access Hive warehouse: %s'
    LOG.exception(msg % warehouse)
    return [(NICE_NAME, _(msg) % warehouse)]

  return res

def load_data(self, database, table, form, design):
  hql = "LOAD DATA INPATH"
  hql += " '%s'" % form.cleaned_data['path']
  if form.cleaned_data['overwrite']:
    hql += " OVERWRITE"
  hql += " INTO TABLE "
  hql += "`%s.%s`" % (database, table.name,)
  if form.partition_columns:
    hql += " PARTITION ("
    vals = []
    for key, column_name in form.partition_columns.iteritems():
      vals.append("%s='%s'" % (column_name, form.cleaned_data[key]))
    hql += ", ".join(vals)
    hql += ")"

  query = hql_query(hql, database)
  design.data = query.dumps()
  design.save()

  return self.execute_query(query, design)

def drop_tables(self, database, tables, design, skip_trash=False, generate_ddl_only=False):
  hql = []

  for table in tables:
    if table.is_view:
      hql.append("DROP VIEW `%s`.`%s`" % (database, table.name,))
    else:
      drop_query = "DROP TABLE `%s`.`%s`" % (database, table.name,)
      drop_query += skip_trash and " PURGE" or ""
      hql.append(drop_query)

  query = hql_query(';'.join(hql), database)

  if generate_ddl_only:
    return query.hql_query
  else:
    design.data = query.dumps()
    design.save()

    return self.execute_query(query, design)

def get_sample(self, database, table):
  """No samples if it's a view (HUE-526)"""
  if not table.is_view:
    limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

    partition_query = ""
    if table.partition_keys:
      partitions = self.get_partitions(database, table, 1)
      partition_query = 'WHERE ' + ' AND '.join(["%s='%s'" % (table.partition_keys[idx].name, key) for idx, key in enumerate(partitions[0].values)])

    hql = "SELECT * FROM `%s.%s` %s LIMIT %s" % (database, table.name, partition_query, limit)
    query = hql_query(hql)
    handle = self.execute_and_wait(query, timeout_sec=5.0)

    if handle:
      result = self.fetch(handle, rows=100)
      self.close(handle)
      return result

def load(self, django_user):
  """
  Upload data to the user's HDFS home, then load (aka move) it into the Hive table
  (in the Hive metastore in HDFS).
  """
  LOAD_HQL = \
    """
    LOAD DATA INPATH
    '%(filename)s' OVERWRITE INTO TABLE %(tablename)s
    """

  fs = cluster.get_hdfs()

  if self.app_name == 'impala':
    # Because Impala does not have impersonation on by default, we use a public destination for the upload.
    from impala.conf import IMPERSONATION_ENABLED
    if not IMPERSONATION_ENABLED.get():
      tmp_public = '/tmp/public_hue_examples'
      fs.do_as_user(django_user, fs.mkdir, tmp_public, '0777')
      hdfs_root_destination = tmp_public
  else:
    hdfs_root_destination = fs.do_as_user(django_user, fs.get_home_dir)

  hdfs_destination = os.path.join(hdfs_root_destination, self.name)

  LOG.info('Uploading local data %s to HDFS table "%s"' % (self.name, hdfs_destination))
  fs.do_as_user(django_user, fs.copyFromLocal, self._contents_file, hdfs_destination)

  LOG.info('Loading data into table "%s"' % (self.name,))
  hql = LOAD_HQL % {'tablename': self.name, 'filename': hdfs_destination}
  query = hql_query(hql)

  try:
    results = dbms.get(django_user, self.query_server).execute_and_wait(query)
    if not results:
      msg = _('Error loading table %(table)s: Operation timeout.') % {'table': self.name}
      LOG.error(msg)
      raise InstallException(msg)
  except QueryServerException, ex:
    msg = _('Error loading table %(table)s: %(error)s.') % {'table': self.name, 'error': ex}
    LOG.error(msg)
    raise InstallException(msg)

def load_data(self, database, table, form_data, design, generate_ddl_only=False):
  hql = "LOAD DATA INPATH"
  hql += " '%(path)s'" % form_data

  if form_data['overwrite']:
    hql += " OVERWRITE"

  hql += " INTO TABLE "
  hql += "`%s`.`%s`" % (database, table)

  if form_data['partition_columns']:
    hql += " PARTITION ("
    vals = ["%s='%s'" % (column_name, column_value) for column_name, column_value in form_data['partition_columns']]
    hql += ", ".join(vals)
    hql += ")"

  if generate_ddl_only:
    return hql
  else:
    query = hql_query(hql, database)
    design.data = query.dumps()
    design.save()

    return self.execute_query(query, design)

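# A worked example (hypothetical form_data, not from the Hue test suite) of the statement
# that load_data() above builds when called with database='default', table='sample' and
# generate_ddl_only=True:
#
#   form_data = {'path': '/user/test/data.csv', 'overwrite': True,
#                'partition_columns': [('dt', '2015-01-01')]}
#
# yields:
#
#   LOAD DATA INPATH '/user/test/data.csv' OVERWRITE INTO TABLE `default`.`sample` PARTITION (dt='2015-01-01')
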
def _load_data_to_table(self, django_user, hql, hdfs_destination):
  LOG.info('Loading data into table "%s"' % (self.name,))
  query = hql_query(hql)

  try:
    results = dbms.get(django_user, self.query_server).execute_and_wait(query)
    if not results:
      msg = _('Error loading table %(table)s: Operation timeout.') % {'table': self.name}
      LOG.error(msg)
      raise InstallException(msg)
  except QueryServerException as ex:
    msg = _('Error loading table %(table)s: %(error)s.') % {'table': self.name, 'error': ex}
    LOG.error(msg)
    raise InstallException(msg)

def get_sample(self, database, table, column=None, nested=None):
  result = None
  hql = None

  if not table.is_view:
    limit = min(100, BROWSE_PARTITIONED_TABLE_LIMIT.get())

    if table.partition_keys:  # Filter on max # of partitions for partitioned tables
      hql = self._get_sample_partition_query(database, table, limit)
    else:
      hql = "SELECT * FROM `%s`.`%s` LIMIT %s" % (database, table.name, limit)

    if hql:
      query = hql_query(hql)
      handle = self.execute_and_wait(query, timeout_sec=5.0)
      if handle:
        result = self.fetch(handle, rows=100)
        self.close(handle)

  return result

def create_database(request):
  if request.method == "POST":
    data = request.POST.copy()
    data.setdefault("use_default_location", False)
    form = CreateDatabaseForm(data)

    if form.is_valid():
      proposed_query = django_mako.render_to_string("create_database_statement.mako", {
        'database': form.cleaned_data,
      })
      # Mako outputs bytestring in utf8
      proposed_query = proposed_query.decode('utf-8')
      query = hql_query(proposed_query)
      return execute_directly(request, query, on_success_url=reverse('metastore:databases'))
  else:
    form = CreateDatabaseForm()

  return render("create_database.mako", request, {
    'database_form': form,
  })

def get_histogram(self, database, table, column, nested=None):
  """
  Returns the results of an Impala SELECT histogram() FROM query for a given column or nested type.

  Assumes that the column/nested type is scalar.
  """
  results = []

  hql = self.get_histogram_query(database, table, column, nested)
  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=5.0)

  if handle:
    result = self.fetch(handle)
    try:
      histogram = list(result.rows())[0][0]  # the actual histogram is in the first-and-only result row
      unique_values = set(histogram.split(', '))
      results = list(unique_values)
    except IndexError, e:
      LOG.warn('Failed to get histogram results, result set has unexpected format: %s' % smart_str(e))
    finally:
      self.close(handle)

  return results

def alter_table(self, database, table_name, new_table_name=None, comment=None, tblproperties=None):
  hql = 'ALTER TABLE `%s`.`%s`' % (database, table_name)

  if new_table_name:
    table_name = new_table_name
    hql += ' RENAME TO `%s`' % table_name
  elif comment:
    hql += " SET TBLPROPERTIES ('comment' = '%s')" % comment
  elif tblproperties:
    hql += " SET TBLPROPERTIES (%s)" % ' ,'.join("'%s' = '%s'" % (k, v) for k, v in tblproperties.items())

  timeout = SERVER_CONN_TIMEOUT.get()
  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=timeout)

  if handle:
    self.close(handle)
  else:
    msg = _("Failed to execute alter table statement: %s") % hql
    raise QueryServerException(msg)

  return self.client.get_table(database, table_name)

def load_after_create(request, database):
  """
  Automatically load data into a newly created table.

  We get here from the create's on_success_url, and expect to find
  ``table`` and ``path`` from the parameters.
  """
  tablename = request.REQUEST.get('table')
  path = request.REQUEST.get('path')

  if not tablename or not path:
    msg = _('Internal error: Missing needed parameter to load data into table.')
    LOG.error(msg)
    raise PopupException(msg)

  LOG.debug("Auto loading data from %s into table %s" % (path, tablename))
  hql = "LOAD DATA INPATH '%s' INTO TABLE `%s.%s`" % (path, database, tablename)
  query = hql_query(hql)

  app_name = get_app_name(request)
  on_success_url = reverse('metastore:describe_table', kwargs={'database': database, 'table': tablename})

  return execute_directly(request, query, on_success_url=on_success_url)

def drop_partition(request, database, table):
  db = dbms.get(request.user)

  if request.method == 'POST':
    partition_specs = request.POST.getlist('partition_selection')
    partition_specs = [spec for spec in partition_specs]

    try:
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
      query_history = db.drop_partitions(database, table, partition_specs, design)
      url = reverse('beeswax:watch_query_history', kwargs={'query_history_id': query_history.id}) + '?on_success_url=' + \
          reverse('metastore:describe_partitions', kwargs={'database': database, 'table': table})
      return redirect(url)
    except Exception, ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(partition)s. Error: %(error)s") % {'partition': '\n'.join(partition_specs), 'error': error_message}
      raise PopupException(error, title=_("Hive Error"), detail=log)

def get_table_columns_stats(self, database, table, column):
  if self.server_name == 'impala':
    hql = 'SHOW COLUMN STATS `%(database)s`.`%(table)s`' % {'database': database, 'table': table}
  else:
    hql = 'DESCRIBE FORMATTED `%(database)s`.`%(table)s` `%(column)s`' % {'database': database, 'table': table, 'column': column}

  query = hql_query(hql)
  handle = self.execute_and_wait(query, timeout_sec=5.0)

  if handle:
    result = self.fetch(handle, rows=100)
    self.close(handle)
    data = list(result.rows())

    if self.server_name == 'impala':
      data = [col for col in data if col[0] == column][0]
      return [
        {'col_name': data[0]},
        {'data_type': data[1]},
        {'distinct_count': data[2]},
        {'num_nulls': data[3]},
        {'max_col_len': data[4]},
        {'avg_col_len': data[5]},
      ]
    else:
      return [
        {'col_name': data[2][0]},
        {'data_type': data[2][1]},
        {'min': data[2][2]},
        {'max': data[2][3]},
        {'num_nulls': data[2][4]},
        {'distinct_count': data[2][5]},
        {'avg_col_len': data[2][6]},
        {'max_col_len': data[2][7]},
        {'num_trues': data[2][8]},
        {'num_falses': data[2][9]}
      ]
  else:
    return []

def test_split_statements():
  assert_equal([''], hql_query(";;;").statements)
  assert_equal(["select * where id == '10'"], hql_query("select * where id == '10'").statements)
  assert_equal(["select * where id == '10'"], hql_query("select * where id == '10';").statements)
  assert_equal(['select', "select * where id == '10;' limit 100"], hql_query("select; select * where id == '10;' limit 100;").statements)
  assert_equal(['select', "select * where id == \"10;\" limit 100"], hql_query("select; select * where id == \"10;\" limit 100;").statements)
  assert_equal(['select', "select * where id == '\"10;\"\"\"' limit 100"], hql_query("select; select * where id == '\"10;\"\"\"' limit 100;").statements)

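# A minimal sketch of the quote-aware splitting that test_split_statements() above exercises.
# This is NOT Hue's actual implementation; it only shows the core idea the assertions rely on:
# a semicolon ends a statement only when it is outside single- or double-quoted strings
# (escapes and comments are ignored here for brevity).
def _split_statements_sketch(hql):
  statements, current, quote = [], '', None
  for ch in hql:
    if quote:                  # inside a quoted string: copy everything through
      current += ch
      if ch == quote:
        quote = None
    elif ch in ('"', "'"):     # entering a quoted string
      quote = ch
      current += ch
    elif ch == ';':            # statement boundary
      statements.append(current.strip())
      current = ''
    else:
      current += ch
  if current.strip():
    statements.append(current.strip())
  statements = [s for s in statements if s]
  return statements or ['']    # mirror the [''] result for ";;;" in the test above

assert _split_statements_sketch(";;;") == ['']
assert _split_statements_sketch("select; select * where id == '10;' limit 100;") == ['select', "select * where id == '10;' limit 100"]
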
def load(self, django_user):
  """
  Load data into table. Raise InstallException on failure.
  """
  LOAD_HQL = \
    """
    LOAD DATA local INPATH
    '%(filename)s' OVERWRITE INTO TABLE %(tablename)s
    """

  LOG.info('Loading data into table "%s"' % (self.name,))
  hql = LOAD_HQL % dict(tablename=self.name, filename=self._contents_file)
  query = hql_query(hql)

  try:
    results = dbms.get(django_user).execute_and_wait(query)
    if not results:
      msg = _('Error loading table %(table)s: Operation timeout.') % {'table': self.name}
      LOG.error(msg)
      raise InstallException(msg)
  except BeeswaxException, ex:
    msg = _('Error loading table %(table)s: %(error)s') % {'table': self.name, 'error': ex}
    LOG.error(msg)
    raise InstallException(msg)

def drop_database(request):
  db = dbms.get(request.user)

  if request.method == 'POST':
    databases = request.POST.getlist('database_selection')

    try:
      # Can't be simpler without an important refactoring
      design = SavedQuery.create_empty(app_name='beeswax', owner=request.user, data=hql_query('').dumps())
      query_history = db.drop_databases(databases, design)
      url = reverse('beeswax:watch_query', args=[query_history.id]) + '?on_success_url=' + reverse('metastore:databases')
      return redirect(url)
    except Exception, ex:
      error_message, log = dbms.expand_exception(ex, db)
      error = _("Failed to remove %(databases)s. Error: %(error)s") % {'databases': ','.join(databases), 'error': error_message}
      raise PopupException(error, title=_("Hive Error"), detail=log)

def get_tables(self, database='default', table_names='*'):
  identifier = self.to_matching_wildcard(table_names)
  identifier = "'%s'" % identifier if identifier != '*' else ''  # Filter not supported in SparkSql

  hql = "SHOW TABLES IN `%s` %s" % (database, identifier)  # self.client.get_tables(database, table_names) is too slow
  query = hql_query(hql)
  timeout = SERVER_CONN_TIMEOUT.get()

  handle = self.execute_and_wait(query, timeout_sec=timeout)

  if handle:
    result = self.fetch(handle, rows=5000)
    self.close(handle)
    tables = [table[0] for table in result.rows()]  # We only keep the first column as the name, SparkSql returns multiple columns
    if len(tables) <= APPLY_NATURAL_SORT_MAX.get():
      tables = apply_natural_sort(tables)
    return tables
  else:
    return []

def drop_partitions(self, db_name, table_name, partition_specs, design=None, generate_ddl_only=False):
  hql = []

  for partition_spec in partition_specs:
    hql.append("ALTER TABLE `%s`.`%s` DROP IF EXISTS PARTITION (%s) PURGE" % (db_name, table_name, partition_spec))

  hql = ';'.join(hql)
  query = hql_query(hql, db_name)

  if generate_ddl_only:
    return hql
  else:
    design.data = query.dumps()
    design.save()

    return self.execute_query(query, design)