def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)
  finally:
    if notebook['type'].startswith('query-'):
      _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
      if 'handle' in response:  # No failure
        _snippet['result']['handle'] = response['handle']
        _snippet['result']['statements_count'] = response['handle']['statements_count']
      else:
        _snippet['status'] = 'failed'
      history = _historify(notebook, request.user)
      response['history_id'] = history.id
      response['history_uuid'] = history.uuid

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)
def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)
  finally:
    if notebook['type'].startswith('query-'):
      _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
      if 'handle' in response:  # No failure
        _snippet['result']['handle'] = response['handle']
      else:
        _snippet['status'] = 'failed'
      history = _historify(notebook, request.user)
      response['history_id'] = history.id
      response['history_uuid'] = history.uuid

      if notebook['isSaved']:  # Keep track of history of saved queries
        response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)
def view_results(request, id, first_row=0):
  """
  Returns the view for the results of the QueryHistory with the given id.

  The query results MUST be ready.
  To display query results, one should always go through the execute_query view.
  If the result set has has_result_set=False, display an empty result.

  If ``first_row`` is 0, restarts (if necessary) the query read. Otherwise, just
  spits out a warning if first_row doesn't match the server's conception.
  Multiple readers will produce a confusing interaction here, and that's known.

  It understands the ``context`` GET parameter. (See execute_query().)
  """
  first_row = int(first_row)
  start_over = (first_row == 0)
  results = type('Result', (object,), {
    'rows': 0,
    'columns': [],
    'has_more': False,
    'start_row': 0,
  })
  data = []
  fetch_error = False
  error_message = ''
  log = ''
  columns = []
  app_name = get_app_name(request)

  query_history = authorized_get_query_history(request, id, must_exist=True)
  query_server = query_history.get_query_server_config()
  db = dbms.get(request.user, query_server)

  handle, state = _get_query_handle_and_state(query_history)
  context_param = request.GET.get('context', '')
  query_context = parse_query_context(context_param)

  # Update the status as expired should not be accessible
  expired = state == models.QueryHistory.STATE.expired

  # Retrieve query results or use empty result if no result set
  try:
    if query_server['server_name'] == 'impala' and not handle.has_result_set:
      downloadable = False
    else:
      results = db.fetch(handle, start_over, 100)

      # Materialize and HTML escape results
      data = escape_rows(results.rows())

      # We display the "Download" button only when we know that there are results:
      downloadable = first_row > 0 or data
      log = db.get_log(handle)
      columns = results.data_table.cols()
  except Exception as ex:
    LOG.exception('error fetching results')

    fetch_error = True
    error_message, log = expand_exception(ex, db, handle)
def _get_sample_data(db, database, table):
  table_obj = db.get_table(database, table)
  sample_data = db.get_sample(database, table_obj)
  response = {'status': -1}

  if sample_data:
    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['rows'] = escape_rows(sample_data.rows(), nulls_only=True)
  else:
    response['message'] = _('Failed to get sample data.')

  return response
def get_indexes(request, database, table):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1}

  indexes = db.get_indexes(database, table)
  if indexes:
    response['status'] = 0
    response['headers'] = indexes.cols()
    response['rows'] = escape_rows(indexes.rows(), nulls_only=True)
  else:
    response['message'] = _('Failed to get indexes.')

  return JsonResponse(response)
def get_indexes(request, database, table):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1, 'error_message': ''}

  indexes = db.get_indexes(database, table)
  if indexes:
    response['status'] = 0
    response['headers'] = indexes.cols()
    response['rows'] = escape_rows(indexes.rows(), nulls_only=True)
  else:
    response['error_message'] = _('Index data took too long to be generated')

  return JsonResponse(response)
def execute(request): response = {"status": -1} notebook = json.loads(request.POST.get("notebook", "{}")) snippet = json.loads(request.POST.get("snippet", "{}")) response["handle"] = get_api(request.user, snippet, request.fs, request.jt).execute(notebook, snippet) # Materialize and HTML escape results if response["handle"].get("sync") and response["handle"]["result"].get("data"): response["handle"]["result"]["data"] = escape_rows(response["handle"]["result"]["data"]) response["status"] = 0 return JsonResponse(response)
def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response['handle'] = get_api(request, snippet).execute(notebook, snippet)

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)
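# Every variant above funnels row data through escape_rows() before JSON
# serialization. The real helper ships with Hue's desktop library; this is a
# minimal, hedged sketch of assumed behavior (not the actual implementation):
# HTML-escape string cells, and with nulls_only=True only normalize NULLs.
from django.utils.html import escape

def escape_rows_sketch(rows, nulls_only=False):
  escaped_rows = []
  for row in rows:
    escaped_row = []
    for cell in row:
      if cell is None:
        escaped_row.append('NULL')  # normalize missing values
      elif isinstance(cell, str) and not nulls_only:
        escaped_row.append(escape(cell))  # neutralize embedded HTML
      else:
        escaped_row.append(cell)  # numbers etc. pass through unchanged
    escaped_rows.append(escaped_row)
  return escaped_rows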
def get_sample_data(request, database, table):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1}

  table_obj = db.get_table(database, table)
  sample_data = db.get_sample(database, table_obj)

  if sample_data:
    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['rows'] = escape_rows(sample_data.rows(), nulls_only=True)
  else:
    response['message'] = _('Failed to get sample data.')

  return JsonResponse(response)
def get_functions(request):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1}

  prefix = request.GET.get('prefix', None)
  functions = db.get_functions(prefix)

  if functions:
    response['status'] = 0
    rows = escape_rows(functions.rows(), nulls_only=True)
    response['functions'] = [row[0] for row in rows]
  else:
    response['message'] = _('Failed to get functions.')

  return JsonResponse(response)
def get_sample_data(request, database, table):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1, 'error_message': ''}

  table_obj = db.get_table(database, table)
  sample_data = db.get_sample(database, table_obj)

  if sample_data:
    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['rows'] = escape_rows(sample_data.rows(), nulls_only=True)
  else:
    response['error_message'] = _('Sample data took too long to be generated')

  return JsonResponse(response)
def get_sample_data(request, database, table):
  db = dbms.get(request.user)
  response = {'status': -1, 'error_message': ''}

  try:
    table_obj = db.get_table(database, table)
    sample_data = db.get_sample(database, table_obj)
    if sample_data:
      response['status'] = 0
      response['headers'] = sample_data.cols()
      response['rows'] = escape_rows(sample_data.rows(), nulls_only=True)
    else:
      response['error_message'] = _('Sample data took too long to be generated')
  except Exception as ex:
    error_message, logs = dbms.expand_exception(ex, db)
    response['error_message'] = error_message

  return JsonResponse(response)
def get_indexes(request, database, table):
  query_server = dbms.get_query_server_config(get_app_name(request))
  db = dbms.get(request.user, query_server)
  response = {'status': -1, 'error_message': ''}

  try:
    indexes = db.get_indexes(database, table)
    if indexes:
      response['status'] = 0
      response['headers'] = indexes.cols()
      response['rows'] = escape_rows(indexes.rows(), nulls_only=True)
    else:
      response['error_message'] = _('Index data took too long to be generated')
  except Exception as ex:
    error_message, logs = dbms.expand_exception(ex, db)
    response['error_message'] = error_message

  return JsonResponse(response)
def fetch_result_data(request): response = {"status": -1} notebook = json.loads(request.POST.get("notebook", "{}")) snippet = json.loads(request.POST.get("snippet", "{}")) rows = json.loads(request.POST.get("rows", 100)) start_over = json.loads(request.POST.get("startOver", False)) response["result"] = get_api(request, snippet).fetch_result(notebook, snippet, rows, start_over) # Materialize and HTML escape results if response["result"].get("data") and response["result"].get("type") == "table": response["result"]["data"] = escape_rows(response["result"]["data"]) response["status"] = 0 return JsonResponse(response)
def fetch_result_data(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  rows = json.loads(request.POST.get('rows', '100'))  # defaults must be JSON strings
  start_over = json.loads(request.POST.get('startOver', 'false'))

  response['result'] = get_api(request, snippet).fetch_result(notebook, snippet, rows, start_over)

  # Materialize and HTML escape results
  if response['result'].get('data') and response['result'].get('type') == 'table':
    response['result']['data'] = escape_rows(response['result']['data'])

  response['status'] = 0

  return JsonResponse(response)
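# A hedged example of the POST body the fetch_result_data() variants above
# expect. The 'rows' and 'startOver' fields are JSON-encoded strings because
# the views decode them with json.loads(); the snippet id and endpoint path
# shown here are hypothetical.
import json

payload = {
  'notebook': json.dumps({'type': 'query-hive', 'snippets': []}),
  'snippet': json.dumps({'id': 'abc-123', 'type': 'hive'}),
  'rows': json.dumps(100),       # page size
  'startOver': json.dumps(True)  # True restarts the fetch from row 0
}
# e.g. django.test.Client().post('/notebook/api/fetch_result_data', payload)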
def _get_sample_data(db, database, table, column):
  table_obj = db.get_table(database, table)
  sample_data = db.get_sample(database, table_obj, column)
  response = {'status': -1}

  if sample_data:
    sample = escape_rows(sample_data.rows(), nulls_only=True)

    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response
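# Tiny worked example of the single-column deduplication in _get_sample_data()
# above: first values are collected into a set, sorted, then re-wrapped as
# one-column rows for the response payload.
sample = [['b'], ['a'], ['b'], ['c']]
unique = set(row[0] for row in sample)         # {'a', 'b', 'c'}
wrapped = [[item] for item in sorted(unique)]  # [['a'], ['b'], ['c']]
assert wrapped == [['a'], ['b'], ['c']]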
def get_sample_data(self, database, table, column=None):
  if column is None:
    column = ', '.join([col['name'] for col in self.get_columns(database, table)])

  snippet = {
    'database': database,
    'statement': 'SELECT %s FROM %s LIMIT 250' % (column, table)
  }
  res = self.api.execute(None, snippet)

  response = {'status': -1}
  if res:
    response['status'] = 0
    response['headers'] = [col['name'] for col in res['result']['meta']]
    response['rows'] = escape_rows(res['result']['data'], nulls_only=True)
  else:
    response['message'] = _('Failed to get sample data.')

  return response
def execute(request):
  response = {'status': -1}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)
  finally:
    if notebook['type'].startswith('query-'):
      history = _historify(notebook, request.user)
      response['history_id'] = history.id

  # Materialize and HTML escape results
  if response['handle'].get('sync') and response['handle']['result'].get('data'):
    response['handle']['result']['data'] = escape_rows(response['handle']['result']['data'])

  response['status'] = 0

  return JsonResponse(response)
def _get_sample_data(db, database, table, column):
  table_obj = db.get_table(database, table)
  if table_obj.is_impala_only and db.client.query_server['server_name'] != 'impala':
    query_server = get_query_server_config('impala')
    db = dbms.get(db.client.user, query_server)

  sample_data = db.get_sample(database, table_obj, column)
  response = {'status': -1}

  if sample_data:
    sample = escape_rows(sample_data.rows(), nulls_only=True)

    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['full_headers'] = sample_data.full_cols()
    response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response
def execute(request):
  response = {'status': -1}
  result = None

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  try:
    response['handle'] = get_api(request, snippet).execute(notebook, snippet)

    # Retrieve and remove the result from the handle
    if response['handle'].get('sync'):
      result = response['handle'].pop('result')
  finally:
    if notebook['type'].startswith('query-'):
      _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]
      if 'handle' in response:  # No failure
        _snippet['result']['handle'] = response['handle']
        _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
        _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
        _snippet['result']['handle']['statement'] = response['handle'].get('statement', snippet['statement'])  # For non-HS2, as no multi-query support yet
      else:
        _snippet['status'] = 'failed'
      history = _historify(notebook, request.user)
      response['history_id'] = history.id
      response['history_uuid'] = history.uuid

      if notebook['isSaved']:  # Keep track of history of saved queries
        response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid

  # Inject and HTML escape results
  if result is not None:
    response['result'] = result
    response['result']['data'] = escape_rows(result['data'])

  response['status'] = 0

  return JsonResponse(response)
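# A sketch of the 'notebook' and 'snippet' POST fields the execute() variants
# above consume. The keys shown (type, isSaved, snippets, id, statement) are
# the ones the views actually read; all values are hypothetical.
import json

notebook = {
  'type': 'query-hive',  # must start with 'query-' to be saved to history
  'isSaved': False,      # True additionally returns history_parent_uuid
  'snippets': [{'id': 'abc-123', 'statement': 'SELECT 1'}]
}
snippet = notebook['snippets'][0]
post_data = {'notebook': json.dumps(notebook), 'snippet': json.dumps(snippet)}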
def view_results(request, id, first_row=0):
  """
  Returns the view for the results of the QueryHistory with the given id.

  The query results MUST be ready.
  To display query results, one should always go through the execute_query view.
  If the result set has has_result_set=False, display an empty result.

  If ``first_row`` is 0, restarts (if necessary) the query read. Otherwise, just
  spits out a warning if first_row doesn't match the server's conception.
  Multiple readers will produce a confusing interaction here, and that's known.

  It understands the ``context`` GET parameter. (See execute_query().)
  """
  first_row = int(first_row)
  start_over = (first_row == 0)
  results = type('Result', (object,), {
    'rows': 0,
    'columns': [],
    'has_more': False,
    'start_row': 0,
  })
  data = []
  fetch_error = False
  error_message = ''
  log = ''
  columns = []
  app_name = get_app_name(request)

  query_history = authorized_get_query_history(request, id, must_exist=True)
  query_server = query_history.get_query_server_config()
  db = dbms.get(request.user, query_server)

  handle, state = _get_query_handle_and_state(query_history)
  context_param = request.GET.get('context', '')
  query_context = parse_query_context(context_param)

  # Update the status as expired should not be accessible
  expired = state == models.QueryHistory.STATE.expired

  # Retrieve query results or use empty result if no result set
  try:
    if query_server['server_name'] == 'impala' and not handle.has_result_set:
      downloadable = False
    else:
      results = db.fetch(handle, start_over, 100)

      # Materialize and HTML escape results
      data = escape_rows(results.rows())

      # We display the "Download" button only when we know that there are results:
      downloadable = first_row > 0 or data
      log = db.get_log(handle)
      columns = results.data_table.cols()
  except Exception as ex:
    LOG.exception('error fetching results')

    fetch_error = True
    error_message, log = expand_exception(ex, db, handle)

  # Handle errors
  error = fetch_error or results is None or expired

  context = {
    'error': error,
    'message': error_message,
    'query': query_history,
    'results': data,
    'columns': columns,
    'expected_first_row': first_row,
    'log': log,
    'hadoop_jobs': app_name != 'impala' and parse_out_jobs(log),
    'query_context': query_context,
    'can_save': False,
    'context_param': context_param,
    'expired': expired,
    'app_name': app_name,
    'next_json_set': None,
    'is_finished': query_history.is_finished()
  }

  if not error:
    download_urls = {}
    if downloadable:
      for format in common.DL_FORMATS:
        download_urls[format] = reverse(app_name + ':download', kwargs=dict(id=str(id), format=format))

    results.start_row = first_row

    context.update({
      'id': id,
      'results': data,
      'has_more': results.has_more,
      'next_row': results.start_row + len(data),
      'start_row': results.start_row,
      'expected_first_row': first_row,
      'columns': columns,
      'download_urls': download_urls,
      'can_save': query_history.owner == request.user,
      'next_json_set': reverse(get_app_name(request) + ':view_results', kwargs={'id': str(id), 'first_row': results.start_row + len(data)})
        + '?context=' + context_param
        + '&format=json'
    })

  context['columns'] = massage_columns_for_json(columns)
  if 'save_form' in context:
    del context['save_form']
  if 'query' in context:
    del context['query']

  return JsonResponse(context)
response["history_parent_uuid"] = ( history.dependencies.filter(type__startswith="query-").latest("last_modified").uuid ) except QueryError, ex: # We inject the history information from _historify() to the failed queries if response.get("history_id"): ex.extra["history_id"] = response["history_id"] if response.get("history_uuid"): ex.extra["history_uuid"] = response["history_uuid"] if response.get("history_parent_uuid"): ex.extra["history_parent_uuid"] = response["history_parent_uuid"] raise ex # Inject and HTML escape results if result is not None: response["result"] = result response["result"]["data"] = escape_rows(result["data"]) response["status"] = 0 return response @require_POST @check_document_access_permission() @api_error_handler def execute(request): notebook = json.loads(request.POST.get("notebook", "{}")) snippet = json.loads(request.POST.get("snippet", "{}")) response = _execute_notebook(request, notebook, snippet)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'localfile':
    path = urllib_unquote(file_format['path'])

    with open(path, 'r') as local_file:
      reader = csv.reader(local_file)
      csv_data = list(reader)

      if file_format['format']['hasHeader']:
        sample = csv_data[1:5]
        column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in csv_data[0]]
      else:
        sample = csv_data[:4]
        column_row = ['field_' + str(count + 1) for count, col in enumerate(sample[0])]

      field_type_guesses = []
      for count, col in enumerate(column_row):
        column_samples = [sample_row[count] for sample_row in sample if len(sample_row) > count]
        field_type_guess = guess_field_type_from_samples(column_samples)
        field_type_guesses.append(field_type_guess)

      columns = [
        Field(column_row[count], field_type_guesses[count]).to_dict()
        for count, col in enumerate(column_row)
      ]

      format_ = {'columns': columns, 'sample': sample}

  elif file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])

    if path[-3:] == 'xls' or path[-4:] == 'xlsx':
      path = excel_to_csv_file_name_change(path)

    stream = request.fs.open(path)
    encoding = check_encoding(stream.read(10000))
    LOG.debug('File %s encoding is %s' % (path, encoding))
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data(
      {'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }

  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warning('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }

  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }

  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics'))

      kafkaFieldNames = [col['name'] for col in data['full_headers']]
      kafkaFieldTypes = [col['type'] for col in data['full_headers']]
      topics_data = data['rows']

      format_ = {
        "sample": topics_data,
        "columns": [
          Field(col, 'string', unique=False).to_dict()
          for col in kafkaFieldNames
        ]
      }
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }

  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
        'name': column['name'],
        'type': column['type']
      } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']]
      query = 'SELECT %s FROM %s LIMIT 4' % (
        ', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      print(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)
      response['history_uuid'] = history.uuid

      if notebook['isSaved']:  # Keep track of history of saved queries
        response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid
  except QueryError as ex:
    # We inject the history information from _historify() to the failed queries
    if response.get('history_id'):
      ex.extra['history_id'] = response['history_id']
    if response.get('history_uuid'):
      ex.extra['history_uuid'] = response['history_uuid']
    if response.get('history_parent_uuid'):
      ex.extra['history_parent_uuid'] = response['history_parent_uuid']

    raise ex

  # Inject and HTML escape results
  if result is not None:
    response['result'] = result
    response['result']['data'] = escape_rows(result['data'])

  response['status'] = 0

  return response


@require_POST
@check_document_access_permission()
@api_error_handler
def execute(request):
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response = _execute_notebook(request, notebook, snippet)

  return JsonResponse(response)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = check_encoding(stream.read(10000))
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data(
      {'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }

  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }

  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }

  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
        kafkaFieldNames = [
          'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db',
          'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip',
          'name', 'objectType', 'objType', 'objUsageType', 'operationParams',
          'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier',
          'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src',
          'status', 'subOperation', 'tableName', 'table', 'time', 'type', 'url', 'user'
        ]
        kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
        kafkaFieldNames.append('timeDate')
        kafkaFieldTypes.append('date')
      else:
        # Note: mocked here, should come from SFDC or Kafka API or sampling job
        kafkaFieldNames = file_format.get('kafkaFieldNames', '').split(',')
        kafkaFieldTypes = file_format.get('kafkaFieldTypes', '').split(',')

      data = """%(kafkaFieldNames)s
%(data)s""" % {
        'kafkaFieldNames': ','.join(kafkaFieldNames),
        'data': '\n'.join([','.join(['...'] * len(kafkaFieldTypes))] * 5)
      }
      stream = string_io()
      stream.write(data)

      _convert_format(file_format["format"], inverse=True)

      indexer = MorphlineIndexer(request.user, request.fs)
      format_ = indexer.guess_field_types({
        "file": {
          "stream": stream,
          "name": file_format['path']
        },
        "format": file_format['format']
      })
      type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))

      for col in format_['columns']:
        col['keyType'] = type_mapping[col['name']]
        col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }

  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
        'name': column['name'],
        'type': column['type']
      } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']]
      query = 'SELECT %s FROM %s LIMIT 4' % (
        ', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      print(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)
    response['status'] = 0

    if is_async:  # `async` is a reserved word in Python 3
      notebook = make_notebook(
        name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
        editor_type=_get_servername(db),
        statement=sample_data,
        status='ready-execute',
        skip_historify=True,
        is_task=False,
        compute=cluster if cluster else None
      )
      response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
      if table_obj.is_impala_only:
        response['result']['type'] = 'impala'
    else:
      sample = escape_rows(sample_data.rows(), nulls_only=True)

      if column:
        sample = set([row[0] for row in sample])
        sample = [[item] for item in sorted(list(sample))]

      response['headers'] = sample_data.cols()
      response['full_headers'] = sample_data.full_cols()
      response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response


@error_handler
def get_indexes(request, database, table):
          response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid
  except QueryError as ex:
    # We inject the history information from _historify() to the failed queries
    if response.get('history_id'):
      ex.extra['history_id'] = response['history_id']
    if response.get('history_uuid'):
      ex.extra['history_uuid'] = response['history_uuid']
    if response.get('history_parent_uuid'):
      ex.extra['history_parent_uuid'] = response['history_parent_uuid']

    raise ex

  # Inject and HTML escape results
  if result is not None:
    response['result'] = result
    response['result']['data'] = escape_rows(result['data'])

  response['status'] = 0

  return response


@require_POST
@check_document_access_permission()
@api_error_handler
def execute(request, engine=None):
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  response = _execute_notebook(request, notebook, snippet)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib.unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data(
      {'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }

  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
def _execute_notebook(request, notebook, snippet):
  response = {'status': -1}
  result = None
  history = None

  historify = (notebook['type'] != 'notebook' or snippet.get('wasBatchExecuted')) and not notebook.get('skipHistorify')

  try:
    try:
      sessions = notebook.get('sessions') and notebook['sessions']  # Session reference for snippet execution without persisting it
      active_executable = json.loads(request.POST.get('executable', '{}'))  # Editor v2
      # TODO: Use statement, database etc. from active_executable

      if historify:
        history = _historify(notebook, request.user)
        notebook = Notebook(document=history).get_data()

      interpreter = get_api(request, snippet)
      if snippet.get('interface') == 'sqlalchemy':
        interpreter.options['session'] = sessions[0]

      with opentracing.tracer.start_span('interpreter') as span:
        # interpreter.execute needs the sessions, but we don't want to persist them
        pre_execute_sessions = notebook['sessions']
        notebook['sessions'] = sessions
        response['handle'] = interpreter.execute(notebook, snippet)
        notebook['sessions'] = pre_execute_sessions

      # Retrieve and remove the result from the handle
      if response['handle'].get('sync'):
        result = response['handle'].pop('result')
    finally:
      if historify:
        _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]

        if 'id' in active_executable:  # Editor v2
          # notebook_executable is the 1-to-1 match of active_executable in the notebook structure
          notebook_executable = [e for e in _snippet['executor']['executables'] if e['id'] == active_executable['id']][0]
          if 'handle' in response:
            notebook_executable['handle'] = response['handle']
          if history:
            notebook_executable['history'] = {
              'id': history.id,
              'uuid': history.uuid
            }
            notebook_executable['operationId'] = history.uuid

        if 'handle' in response:  # No failure
          if 'result' not in _snippet:  # Editor v2
            _snippet['result'] = {}
          _snippet['result']['handle'] = response['handle']
          _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
          _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
          _snippet['result']['handle']['statement'] = response['handle'].get('statement', snippet['statement']).strip()  # For non-HS2, as no multi-query support yet
        else:
          _snippet['status'] = 'failed'

        if history:  # If _historify failed, history will be None. If we get Atomic block exception, something underneath interpreter.execute() crashed and is not handled.
          history.update_data(notebook)
          history.save()

          response['history_id'] = history.id
          response['history_uuid'] = history.uuid
          if notebook['isSaved']:  # Keep track of history of saved queries
            response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid
  except QueryError as ex:
    # We inject the history information from _historify() to the failed queries
    if response.get('history_id'):
      ex.extra['history_id'] = response['history_id']
    if response.get('history_uuid'):
      ex.extra['history_uuid'] = response['history_uuid']
    if response.get('history_parent_uuid'):
      ex.extra['history_parent_uuid'] = response['history_parent_uuid']

    raise ex

  # Inject and HTML escape results
  if result is not None:
    response['result'] = result
    response['result']['data'] = escape_rows(result['data'])

  response['status'] = 0

  return response
def _execute_notebook(request, notebook, snippet):
  response = {'status': -1}
  result = None
  history = None

  historify = (notebook['type'] != 'notebook' or snippet.get('wasBatchExecuted')) and not notebook.get('skipHistorify')

  try:
    try:
      session = notebook.get('sessions') and notebook['sessions'][0]  # Session reference for snippet execution without persisting it

      if historify:
        history = _historify(notebook, request.user)
        notebook = Notebook(document=history).get_data()

      interpreter = get_api(request, snippet)
      if snippet.get('interface') == 'sqlalchemy':
        interpreter.options['session'] = session

      response['handle'] = interpreter.execute(notebook, snippet)

      # Retrieve and remove the result from the handle
      if response['handle'].get('sync'):
        result = response['handle'].pop('result')
    finally:
      if historify:
        _snippet = [s for s in notebook['snippets'] if s['id'] == snippet['id']][0]

        if 'handle' in response:  # No failure
          _snippet['result']['handle'] = response['handle']
          _snippet['result']['statements_count'] = response['handle'].get('statements_count', 1)
          _snippet['result']['statement_id'] = response['handle'].get('statement_id', 0)
          _snippet['result']['handle']['statement'] = response['handle'].get('statement', snippet['statement']).strip()  # For non-HS2, as no multi-query support yet
        else:
          _snippet['status'] = 'failed'

        if history:  # If _historify failed, history will be None. If we get Atomic block exception, something underneath interpreter.execute() crashed and is not handled.
          history.update_data(notebook)
          history.save()

          response['history_id'] = history.id
          response['history_uuid'] = history.uuid
          if notebook['isSaved']:  # Keep track of history of saved queries
            response['history_parent_uuid'] = history.dependencies.filter(type__startswith='query-').latest('last_modified').uuid
  except QueryError as ex:
    # We inject the history information from _historify() to the failed queries
    if response.get('history_id'):
      ex.extra['history_id'] = response['history_id']
    if response.get('history_uuid'):
      ex.extra['history_uuid'] = response['history_uuid']
    if response.get('history_parent_uuid'):
      ex.extra['history_parent_uuid'] = response['history_parent_uuid']

    raise ex

  # Inject and HTML escape results
  if result is not None:
    response['result'] = result
    response['result']['data'] = escape_rows(result['data'])

  response['status'] = 0

  return response
                      table=None, column=None, is_async=False, operation=None):
    engine = self._create_engine()
    inspector = inspect(engine)

    assist = Assist(inspector, engine, backticks=self.backticks)
    response = {'status': -1, 'result': {}}

    metadata, sample_data = assist.get_sample_data(database, table, column)
    has_result_set = sample_data is not None

    if sample_data:
      response['status'] = 0
      response['rows'] = escape_rows(sample_data)

    if table:
      columns = assist.get_columns(database, table)
      response['full_headers'] = [{
        'name': col.get('name'),
        'type': str(col.get('type')),
        'comment': ''
      } for col in columns]
    elif metadata:
      response['full_headers'] = [{
        'name': col[0] if type(col) is dict or type(col) is tuple else col,
        'type': 'STRING_TYPE',
        'comment':