def action_parameters(request):
  response = {'status': -1}
  parameters = set()

  try:
    node_data = json.loads(request.POST.get('node', '{}'))

    parameters = parameters.union(set(Node(node_data).find_parameters()))

    script_path = node_data.get('properties', {}).get('script_path', {})
    if script_path:
      script_path = script_path.replace('hdfs://', '')

      if request.fs.do_as_user(request.user, request.fs.exists, script_path):
        data = request.fs.do_as_user(request.user, request.fs.read, script_path, 0, 16 * 1024 ** 2)

        if node_data['type'] in ('hive', 'hive2'):
          parameters = parameters.union(set(find_dollar_braced_variables(data)))
        elif node_data['type'] == 'pig':
          parameters = parameters.union(set(find_dollar_variables(data)))
    elif node_data['type'] == 'hive-document':
      notebook = Notebook(document=Document2.objects.get_by_uuid(user=request.user, uuid=node_data['properties']['uuid']))
      parameters = parameters.union(set(find_dollar_braced_variables(notebook.get_str())))

    response['status'] = 0
    response['parameters'] = list(parameters)
  except Exception as e:
    response['message'] = str(e)

  return JsonResponse(response)
def get_history(request):
  response = {'status': -1}

  doc_type = request.GET.get('doc_type')
  limit = min(int(request.GET.get('len', 50)), 100)  # Cap the page size at 100 entries

  response['status'] = 0
  history = []
  for doc in Document2.objects.get_history(doc_type='query-%s' % doc_type, user=request.user).order_by('-last_modified')[:limit]:
    notebook = Notebook(document=doc).get_data()
    if 'snippets' in notebook:
      history.append({
        'name': doc.name,
        'id': doc.id,
        'uuid': doc.uuid,
        'type': doc.type,
        'data': {
            'statement_raw': notebook['snippets'][0]['statement_raw'][:1001],
            'lastExecuted': notebook['snippets'][0]['lastExecuted'],
            'status': notebook['snippets'][0]['status'],
            'parentUuid': notebook.get('parentUuid', '')
        } if notebook['snippets'] else {},
        'absoluteUrl': doc.get_absolute_url(),
      })
    else:
      LOG.error('Incomplete History Notebook: %s' % notebook)

  response['history'] = history
  response['message'] = _('History fetched')

  return JsonResponse(response)
def notebook(request): notebook_id = request.GET.get("notebook") if notebook_id: notebook = Notebook(document=Document2.objects.get(id=notebook_id)) else: notebook = Notebook() autocomplete_base_url = "" try: autocomplete_base_url = reverse("beeswax:api_autocomplete_databases", kwargs={}) except: LOG.exception("failed to get autocomplete base url") is_yarn_mode = False try: from spark.conf import LIVY_SERVER_SESSION_KIND is_yarn_mode = LIVY_SERVER_SESSION_KIND.get() except: LOG.exception("Spark is not enabled") return render( "notebook.mako", request, { "notebooks_json": json.dumps([notebook.get_data()]), "options_json": json.dumps( {"languages": get_interpreters(request.user), "session_properties": SparkApi.PROPERTIES} ), "autocomplete_base_url": autocomplete_base_url, "is_yarn_mode": is_yarn_mode, }, )
def notebook(request): notebook_id = request.GET.get('notebook') if notebook_id: notebook = Notebook(document=Document2.objects.get(id=notebook_id)) else: notebook = Notebook() autocomplete_base_url = '' try: autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={}) except: LOG.exception('failed to get autocomplete base url') return render('notebook.mako', request, { 'notebooks_json': json.dumps([notebook.get_data()]), 'options_json': json.dumps({ 'languages': get_interpreters(), 'snippet_placeholders' : { 'sql': _('Example: 1 + 1, or press CTRL + space'), 'spark': _('Example: 1 + 1, or press CTRL + space'), 'pyspark': _('Example: 1 + 1, or press CTRL + space'), 'impala': _('Example: SELECT * FROM tablename, or press CTRL + space'), 'hive': _('Example: SELECT * FROM tablename, or press CTRL + space'), 'r': _('Example: 1 + 1, or press CTRL + space') }, 'session_properties': SparkApi.PROPERTIES }), 'autocomplete_base_url': autocomplete_base_url, 'is_yarn_mode': LIVY_SERVER_SESSION_KIND.get() })
def editor(request): editor_id = request.GET.get('editor') if editor_id: editor = Notebook(document=Document2.objects.get(id=editor_id)) else: editor = Notebook() data = editor.get_data() data['name'] = 'Hive SQL Editor' data['snippets'] = json.loads('[{"id":"c111cbb4-f475-4050-c5a1-02df6c31e3d8","name":"","type":"hive","editorMode":"text/x-hiveql","statement_raw":"Example: SELECT * FROM tablename, or press CTRL + space","codemirrorSize":100,"status":"ready","properties":{"settings":[],"files":[]},"variables":[],"variableNames":[],"statement":"Example: SELECT * FROM tablename, or press CTRL + space","result":{"id":"149347d9-3ae7-8d06-4cc8-d4bce5e72dc8","type":"table","hasResultset":true,"handle":{},"meta":[],"cleanedMeta":[],"fetchedOnce":false,"startTime":"2015-07-17T20:38:21.970Z","endTime":"2015-07-17T20:38:21.970Z","executionTime":0,"cleanedNumericMeta":[],"cleanedStringMeta":[],"cleanedDateTimeMeta":[],"data":[],"logs":"","logLines":0,"errors":"","hasSomeResults":false},"showGrid":true,"showChart":false,"showLogs":false,"progress":0,"size":12,"offset":0,"isLoading":false,"klass":"snippet card card-widget","editorKlass":"editor span12","resultsKlass":"results hive","errorsKlass":"results hive alert alert-error","chartType":"bars","chartSorting":"none","chartYMulti":[],"chartData":[],"tempChartOptions":{},"isLeftPanelVisible":false,"codeVisible":true,"settingsVisible":false,"checkStatusTimeout":null}]') editor.data = json.dumps(data) autocomplete_base_url = '' try: autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={}) except: LOG.exception('failed to get autocomplete base url') return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "Hive SQL", "type": "hive"}], 'snippet_placeholders' : { 'scala': _('Example: 1 + 1, or press CTRL + space'), 'python': _('Example: 1 + 1, or press CTRL + space'), 'impala': _('Example: SELECT * FROM tablename, or press CTRL + space'), 'hive': _('Example: SELECT * FROM tablename, or press CTRL + space'), 'text': _('<h2>This is a text snippet</h2>Type your text here') } }), 'autocomplete_base_url': autocomplete_base_url, })
def execute_and_watch(request): notebook_id = request.GET.get('editor', request.GET.get('notebook')) snippet_id = int(request.GET['snippet']) action = request.GET['action'] destination = request.GET['destination'] notebook = Notebook(document=Document2.objects.get(id=notebook_id)) snippet = notebook.get_data()['snippets'][snippet_id] editor_type = snippet['type'] api = get_api(request, snippet) if action == 'save_as_table': sql, success_url = api.export_data_as_table(snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') elif action == 'insert_as_query': sql, success_url = api.export_large_data_to_hdfs(snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') else: raise PopupException(_('Action %s is unknown') % action) return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}], 'mode': 'editor', 'success_url': success_url }), 'editor_type': editor_type, })
def notebook(request): notebook_id = request.GET.get('notebook') if notebook_id: notebook = Notebook(document=Document2.objects.get(id=notebook_id)) else: notebook = Notebook() autocomplete_base_url = '' try: autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={}) except: LOG.exception('failed to get autocomplete base url') is_yarn_mode = False try: from spark.conf import LIVY_SERVER_SESSION_KIND is_yarn_mode = LIVY_SERVER_SESSION_KIND.get() except: LOG.exception('Spark is not enabled') return render('notebook.mako', request, { 'notebooks_json': json.dumps([notebook.get_data()]), 'options_json': json.dumps({ 'languages': get_interpreters(request.user), 'session_properties': SparkApi.PROPERTIES, }), 'autocomplete_base_url': autocomplete_base_url, 'is_yarn_mode': is_yarn_mode })
def editor(request): editor_id = request.GET.get('editor') editor_type = request.GET.get('type', 'hive') if editor_id: editor = Notebook(document=Document2.objects.get(id=editor_id)) else: editor = Notebook() data = editor.get_data() data['name'] = 'Untitled %s Query' % editor_type.title() data['type'] = 'query-%s' % editor_type editor.data = json.dumps(data) autocomplete_base_url = '' try: autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={}) except: LOG.exception('failed to get autocomplete base url') return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}], 'mode': 'editor', }), 'editor_type': editor_type, 'autocomplete_base_url': autocomplete_base_url, })
def make_notebook2(name='Browse', description='', is_saved=False, snippets=None):
  from notebook.connectors.hiveserver2 import HS2Api

  editor = Notebook()

  _snippets = []

  for snippet in snippets:
    default_properties = {
        'files': [],
        'functions': [],
        'settings': []
    }
    default_properties.update(snippet.get('properties', {}))
    snippet['properties'] = default_properties

    if snippet['type'] == 'hive':
      pass
    elif snippet['type'] == 'impala':
      pass
    elif snippet['type'] == 'java':
      pass

    _snippets.append(snippet)

  data = {
    'name': name,
    'uuid': str(uuid.uuid4()),
    'description': description,
    'sessions': [
      {
        'type': _snippet['type'],
        'properties': HS2Api.get_properties(snippet['type']),
        'id': None
      } for _snippet in _snippets  # Non unique types currently
    ],
    'selectedSnippet': _snippets[0]['type'],
    'type': 'notebook',
    'showHistory': False,
    'isSaved': is_saved,
    'snippets': [
      {
        'status': _snippet.get('status', 'ready'),
        'id': str(uuid.uuid4()),
        'statement_raw': _snippet.get('statement', ''),
        'statement': _snippet.get('statement', ''),
        'type': _snippet.get('type'),
        'properties': _snippet['properties'],
        'name': name,
        'database': _snippet.get('database'),
        'result': {}
      } for _snippet in _snippets
    ]
  }

  editor.data = json.dumps(data)

  return editor
def open_notebook(request): response = {"status": -1} notebook_id = request.GET.get("notebook") notebook = Notebook(document=Document2.objects.get(id=notebook_id)) response["status"] = 0 response["notebook"] = notebook.get_json() response["message"] = _("Notebook loaded successfully")
def open_notebook(request):
  response = {'status': -1}

  notebook_id = request.GET.get('notebook')
  notebook = Notebook(document=Document2.objects.get(id=notebook_id))

  response['status'] = 0
  response['notebook'] = notebook.get_json()
  response['message'] = _('Notebook loaded successfully')

  return JsonResponse(response)
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None): from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() properties = HS2Api.get_properties(editor_type) if editor_type == 'hive': if files is not None: _update_property_value(properties, 'files', files) if functions is not None: _update_property_value(properties, 'functions', functions) if settings is not None: _update_property_value(properties, 'settings', settings) elif editor_type == 'impala': if settings is not None: _update_property_value(properties, 'files', files) editor.data = json.dumps({ 'name': name, 'description': description, 'sessions': [ { 'type': editor_type, 'properties': properties, 'id': None } ], 'selectedSnippet': editor_type, 'type': 'query-%s' % editor_type, 'showHistory': True, 'snippets': [ { 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': 'default', 'result': {} } ] }) return editor
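# A hypothetical usage sketch (not part of the original source): it only calls
# make_notebook() as defined above and inspects the JSON it produces. The query
# text and the Hive setting below are made-up example values, and the settings
# format is assumed.
example_editor = make_notebook(
    name='Browse',
    editor_type='hive',
    statement='SELECT * FROM web_logs LIMIT 100',                 # example statement
    status='ready-execute',
    settings=[{'key': 'hive.execution.engine', 'value': 'tez'}]   # example setting, format assumed
)

example_data = json.loads(example_editor.data)
assert example_data['type'] == 'query-hive'
assert example_data['snippets'][0]['statement'] == 'SELECT * FROM web_logs LIMIT 100'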
def open_notebook(request):
  response = {'status': -1}

  notebook_id = request.GET.get('notebook')
  notebook = Notebook(document=Document2.objects.get(id=notebook_id))

  response['status'] = 0
  response['notebook'] = notebook.get_json()
  response['message'] = _('Notebook saved !')

  return JsonResponse(response)
def extract_archive_in_hdfs(request, upload_path, file_name):
  _upload_extract_archive_script_to_hdfs(request.fs)

  shell_notebook = Notebook()
  shell_notebook.add_shell_snippet(
      shell_command='extract_archive_in_hdfs.sh',
      arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + file_name}],
      archives=[],
      files=[{'value': '/user/' + DEFAULT_USER.get() + '/common/extract_archive_in_hdfs.sh'}, {'value': upload_path + '/' + file_name}],
      env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}])

  return shell_notebook.execute(request, batch=True)
def browse(request, database, table): editor_type = request.GET.get('type', 'hive') snippet = {'type': editor_type} sql_select = get_api(request.user, snippet, request.fs, request.jt).get_select_star_query(snippet, database, table) editor = Notebook() editor.data = json.dumps({ 'description':'', 'sessions':[ { 'type':'hive', 'properties':[ ], 'id':None } ], 'selectedSnippet':'hive', 'type': 'query-%s' % editor_type, 'snippets':[ { 'status':'ready-execute', 'id':'e8b323b3-88ef-3a84-6264-af11fa5fbefb', 'statement_raw': sql_select, 'statement': sql_select, 'type': editor_type, 'properties':{ 'files':[ ], 'settings':[ ] }, 'name': 'Browse', 'database':'default', 'result':{ } } ], 'name':'Browse' }) return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}], 'mode': 'editor', }), 'editor_type': editor_type, })
def get_external_statement(request):
  response = {'status': -1, 'message': ''}

  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))

  if snippet.get('statementType') == 'file':
    response['statement'] = _get_statement_from_file(request.user, request.fs, snippet)
  elif snippet.get('statementType') == 'document':
    notebook = Notebook(Document2.objects.get_by_uuid(user=request.user, uuid=snippet['associatedDocumentUuid'], perm_type='read'))
    response['statement'] = notebook.get_str()

  response['status'] = 0

  return JsonResponse(response)
def get_history(request): response = {"status": -1} doc_type = request.GET.get("doc_type") doc_text = request.GET.get("doc_text") limit = min(request.GET.get("len", 50), 100) docs = Document2.objects.get_history(doc_type="query-%s" % doc_type, user=request.user) if doc_text: docs = docs.filter( Q(name__icontains=doc_text) | Q(description__icontains=doc_text) | Q(search__icontains=doc_text) ) history = [] for doc in docs.order_by("-last_modified")[:limit]: notebook = Notebook(document=doc).get_data() if "snippets" in notebook: statement = _get_statement(notebook) history.append( { "name": doc.name, "id": doc.id, "uuid": doc.uuid, "type": doc.type, "data": { "statement": statement[:1001] if statement else "", "lastExecuted": notebook["snippets"][0]["lastExecuted"], "status": notebook["snippets"][0]["status"], "parentSavedQueryUuid": notebook.get("parentSavedQueryUuid", ""), } if notebook["snippets"] else {}, "absoluteUrl": doc.get_absolute_url(), } ) else: LOG.error("Incomplete History Notebook: %s" % notebook) response["history"] = sorted(history, key=lambda row: row["data"]["lastExecuted"], reverse=True) response["message"] = _("History fetched") response["status"] = 0 return JsonResponse(response)
def get_history(request): response = {'status': -1} doc_type = request.GET.get('doc_type') doc_text = request.GET.get('doc_text') limit = min(request.GET.get('len', 50), 100) docs = Document2.objects.get_history(doc_type='query-%s' % doc_type, user=request.user) if doc_text: docs = docs.filter(Q(name__icontains=doc_text) | Q(description__icontains=doc_text)) history = [] for doc in docs.order_by('-last_modified')[:limit]: notebook = Notebook(document=doc).get_data() if 'snippets' in notebook: try: statement = notebook['snippets'][0]['result']['handle']['statement'] if type(statement) == dict: # Old format statement = notebook['snippets'][0]['statement_raw'] except KeyError: # Old format statement = notebook['snippets'][0]['statement_raw'] history.append({ 'name': doc.name, 'id': doc.id, 'uuid': doc.uuid, 'type': doc.type, 'data': { 'statement': statement[:1001] if statement else '', 'lastExecuted': notebook['snippets'][0]['lastExecuted'], 'status': notebook['snippets'][0]['status'], 'parentSavedQueryUuid': notebook.get('parentSavedQueryUuid', '') } if notebook['snippets'] else {}, 'absoluteUrl': doc.get_absolute_url(), }) else: LOG.error('Incomplete History Notebook: %s' % notebook) response['history'] = sorted(history, key=lambda row: row['data']['lastExecuted'], reverse=True) response['message'] = _('History fetched') response['status'] = 0 return JsonResponse(response)
def create_notebook(request):
  response = {'status': -1}

  editor_type = request.POST.get('type', 'notebook')
  directory_uuid = request.POST.get('directory_uuid')

  editor = Notebook()
  data = editor.get_data()

  if editor_type != 'notebook':
    data['name'] = ''
    data['type'] = 'query-%s' % editor_type  # TODO: Add handling for non-SQL types

  data['directoryUuid'] = directory_uuid
  editor.data = json.dumps(data)

  response['notebook'] = editor.get_data()
  response['status'] = 0

  return JsonResponse(response)
def create_notebook(request): response = {"status": -1} editor_type = request.POST.get("type", "notebook") directory_uuid = request.POST.get("directory_uuid") editor = Notebook() data = editor.get_data() if editor_type != "notebook": data["name"] = "" data["type"] = "query-%s" % editor_type # TODO: Add handling for non-SQL types data["directoryUuid"] = directory_uuid editor.data = json.dumps(data) response["notebook"] = editor.get_data() response["status"] = 0 return JsonResponse(response)
def editor(request): editor_id = request.GET.get('editor') editor_type = request.GET.get('type', 'hive') if editor_id: editor = Notebook(document=Document2.objects.get(id=editor_id)) else: editor = Notebook() data = editor.get_data() data['name'] = 'Unsaved %s Query' % editor_type.title() data['type'] = 'query-%s' % editor_type editor.data = json.dumps(data) return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}], 'mode': 'editor', }), 'editor_type': editor_type, })
def editor(request): editor_id = request.GET.get('editor') editor_type = request.GET.get('type', 'hive') if editor_id: # Open existing saved editor document editor = Notebook(document=Document2.objects.get(id=editor_id)) editor_type = editor.get_data()['type'].rsplit('-', 1)[-1] editor = upgrade_session_properties(request, notebook=editor) else: # Create new editor editor = Notebook() data = editor.get_data() data['name'] = '' data['type'] = 'query-%s' % editor_type # TODO: Add handling for non-SQL types editor.data = json.dumps(data) return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "%s SQL" % editor_type.title(), "type": editor_type}], 'mode': 'editor', 'is_optimizer_enabled': has_optimizer(), }), 'editor_type': editor_type, })
def extract_archive_in_hdfs(request, upload_path, file_name): _upload_extract_archive_script_to_hdfs(request.fs) output_path = upload_path + '/' + file_name.split('.')[0] start_time = json.loads(request.POST.get('start_time', '-1')) shell_notebook = Notebook( name=_('HDFS Extraction of %(upload_path)s/%(file_name)s') % {'upload_path': upload_path, 'file_name': file_name}, isManaged=True, onSuccessUrl=reverse('filebrowser.views.view', kwargs={'path': output_path}) ) shell_notebook.add_shell_snippet( shell_command='extract_archive_in_hdfs.sh', arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + file_name}, {'value': '-o=' + output_path}], archives=[], files=[{'value': '/user/' + DEFAULT_USER.get() + '/common/extract_archive_in_hdfs.sh'}, {"value": upload_path + '/' + urllib.quote(file_name)}], env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}], last_executed=start_time ) return shell_notebook.execute(request, batch=True)
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None): editor = Notebook() editor.data = json.dumps({ 'name': name, 'description': description, 'sessions': [ { 'type': editor_type, 'properties': [ ], 'id': None } ], 'selectedSnippet': editor_type, 'type': 'query-%s' % editor_type, 'showHistory': True, 'snippets': [ { 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': 'default', 'result': {} } ] }) return editor
def notebook(request): notebook_id = request.GET.get('notebook') if notebook_id: notebook = Notebook(document=Document2.objects.get(id=notebook_id)) else: notebook = Notebook() is_yarn_mode = False try: from spark.conf import LIVY_SERVER_SESSION_KIND is_yarn_mode = LIVY_SERVER_SESSION_KIND.get() except: LOG.exception('Spark is not enabled') return render('notebook.mako', request, { 'notebooks_json': json.dumps([notebook.get_data()]), 'options_json': json.dumps({ 'languages': get_interpreters(request.user), 'session_properties': SparkApi.PROPERTIES, }), 'is_yarn_mode': is_yarn_mode })
def compress_files_in_hdfs(request, file_names, upload_path, archive_name): _upload_compress_files_script_to_hdfs(request.fs) files = [{"value": upload_path + '/' + file_name} for file_name in file_names] files.append({'value': '/user/' + DEFAULT_USER.get() + '/common/compress_files_in_hdfs.sh'}) start_time = json.loads(request.POST.get('start_time', '-1')) shell_notebook = Notebook( name=_('HDFS Compression to %(upload_path)s/hue_compressed.zip') % {'upload_path': upload_path}, isManaged=True, onSuccessUrl=reverse('filebrowser.views.view', kwargs={'path': upload_path}) ) shell_notebook.add_shell_snippet( shell_command='compress_files_in_hdfs.sh', arguments=[{'value': '-u=' + upload_path}, {'value': '-f=' + ','.join(file_names)}, {'value': '-n=' + archive_name}], archives=[], files=files, env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}], last_executed=start_time ) return shell_notebook.execute(request, batch=True)
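# A hypothetical sketch (not part of the original source): the same pattern used by
# compress_files_in_hdfs() and extract_archive_in_hdfs() above, with made-up paths,
# showing how a managed shell Notebook is assembled before being submitted as a batch.
demo_notebook = Notebook(
    name='HDFS Compression to /user/demo/hue_compressed.zip',  # example name
    isManaged=True,
    onSuccessUrl='/filebrowser/view=/user/demo'                 # example URL; the real code builds it with reverse()
)
demo_notebook.add_shell_snippet(
    shell_command='compress_files_in_hdfs.sh',
    arguments=[{'value': '-u=/user/demo'}, {'value': '-f=logs.txt'}, {'value': '-n=hue_compressed.zip'}],
    archives=[],
    files=[{'value': '/user/hue/common/compress_files_in_hdfs.sh'}, {'value': '/user/demo/logs.txt'}],
    env_var=[{'value': 'HADOOP_USER_NAME=${wf:user()}'}]
)
# demo_notebook.execute(request, batch=True) would then submit it, given a Django
# request object as in the view functions above.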
def editor(request): editor_id = request.GET.get('editor') if editor_id: editor = Notebook(document=Document2.objects.get(id=editor_id)) else: editor = Notebook() data = editor.get_data() data['name'] = 'My SQL query' editor.data = json.dumps(data) autocomplete_base_url = '' try: autocomplete_base_url = reverse('beeswax:api_autocomplete_databases', kwargs={}) except: LOG.exception('failed to get autocomplete base url') return render('editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{"name": "Hive SQL", "type": "hive"}] }), 'autocomplete_base_url': autocomplete_base_url, })
def _get_statement(notebook):
  if notebook['snippets'] and len(notebook['snippets']) > 0:
    return Notebook.statement_with_variables(notebook['snippets'][0])
  return ''
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None, is_saved=False, database='default', snippet_properties=None, batch_submit=False, on_success_url=None, skip_historify=False, is_task=False, last_executed=-1, is_notebook=False, pub_sub_url=None): ''' skip_historify: do not add the task to the query history. e.g. SQL Dashboard isManaged: true when being a managed by Hue operation (include_managed=True in document), e.g. exporting query result, dropping some tables ''' from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() if snippet_properties is None: snippet_properties = {} if editor_type == 'hive': sessions_properties = HS2Api.get_properties(editor_type) if files is not None: _update_property_value(sessions_properties, 'files', files) if functions is not None: _update_property_value(sessions_properties, 'functions', functions) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'impala': sessions_properties = HS2Api.get_properties(editor_type) if settings is not None: _update_property_value(sessions_properties, 'files', files) elif editor_type == 'java': sessions_properties = [] # Java options else: sessions_properties = [] data = { 'name': name, 'uuid': str(uuid.uuid4()), 'description': description, 'sessions': [{ 'type': editor_type, 'properties': sessions_properties, 'id': None }], 'selectedSnippet': editor_type, 'type': 'notebook' if is_notebook else 'query-%s' % editor_type, 'showHistory': True, 'isSaved': is_saved, 'onSuccessUrl': on_success_url, 'pubSubUrl': pub_sub_url, 'skipHistorify': skip_historify, 'isManaged': is_task, 'snippets': [{ 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'wasBatchExecuted': batch_submit, 'lastExecuted': last_executed, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': database, 'result': { 'handle': {} }, 'variables': [] }] if not is_notebook else [] } if snippet_properties: data['snippets'][0]['properties'].update(snippet_properties) editor.data = json.dumps(data) return editor
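# A hypothetical sketch (not part of the original source): this mirrors the call
# pattern used by execute_and_watch() elsewhere in this module; the statement,
# database and URL values are made-up examples.
watch_editor = make_notebook(
    name='Execute and watch',
    editor_type='impala',
    statement='CREATE TABLE web_logs_copy AS SELECT * FROM web_logs',  # example statement
    status='ready-execute',
    database='default',
    on_success_url='/filebrowser/view=/user/demo',                     # example URL
    is_task=True
)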
def _set_search_field(notebook_doc):
  notebook = Notebook(document=notebook_doc).get_data()
  statement = _get_statement(notebook)
  notebook_doc.search = statement
  return notebook_doc
def make_notebook2(name='Browse', description='', is_saved=False, snippets=None): from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() _snippets = [] for snippet in snippets: default_properties = {'files': [], 'functions': [], 'settings': []} default_properties.update(snippet['properties']) snippet['properties'] = default_properties if snippet['type'] == 'hive': pass elif snippet['type'] == 'impala': pass elif snippet['type'] == 'java': pass _snippets.append(snippet) data = { 'name': name, 'uuid': str(uuid.uuid4()), 'type': 'notebook', 'description': description, 'sessions': [ { 'type': _snippet['type'], 'properties': HS2Api.get_properties(snippet['type']), 'id': None } for _snippet in _snippets # Non unique types currently ], 'selectedSnippet': _snippets[0]['type'], 'type': 'notebook', 'showHistory': False, 'isSaved': is_saved, 'snippets': [{ 'status': _snippet.get('status', 'ready'), 'id': str(uuid.uuid4()), 'statement_raw': _snippet.get('statement', ''), 'statement': _snippet.get('statement', ''), 'type': _snippet.get('type'), 'properties': _snippet['properties'], 'name': name, 'database': _snippet.get('database'), 'result': {}, 'variables': [] } for _snippet in _snippets] } editor.data = json.dumps(data) return editor
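# A hypothetical usage sketch (not part of the original source): building a
# two-snippet notebook with make_notebook2() as defined above. The statements
# and database names are made-up examples.
multi_editor = make_notebook2(
    name='Demo notebook',
    snippets=[
        {'type': 'hive', 'statement': 'SELECT 1', 'database': 'default', 'properties': {}},
        {'type': 'impala', 'statement': 'SELECT 2', 'database': 'default', 'properties': {}}
    ]
)

multi_data = json.loads(multi_editor.data)
assert multi_data['type'] == 'notebook'
assert [s['type'] for s in multi_data['snippets']] == ['hive', 'impala']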
def execute_and_watch(request): notebook_id = request.GET.get('editor', request.GET.get('notebook')) snippet_id = int(request.GET['snippet']) action = request.GET['action'] destination = request.GET['destination'] notebook = Notebook(document=Document2.objects.get( id=notebook_id)).get_data() snippet = notebook['snippets'][snippet_id] editor_type = snippet['type'] api = get_api(request, snippet) if action == 'save_as_table': sql, success_url = api.export_data_as_table(notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'insert_as_query': # TODO: checks/workarounds in case of non impersonation or Sentry # TODO: keep older simpler way in case of known not many rows? sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database'], on_success_url=success_url) elif action == 'index_query': if destination == '__hue__': destination = _get_snippet_name(notebook, unique=True, table_format=True) live_indexing = True else: live_indexing = False sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='') editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True) from indexer.api3 import _index # Will ve moved to the lib from indexer.file_format import HiveFormat from indexer.fields import Field file_format = { 'name': 'col', 'inputFormat': 'query', 'format': { 'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001' }, "sample": '', "columns": [ Field( col['name'].rsplit('.')[-1], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample['meta'] ] } if live_indexing: file_format['inputFormat'] = 'hs2_handle' file_format['fetch_handle'] = lambda rows, start_over: get_api( request, snippet).fetch_result( notebook, snippet, rows=rows, start_over=start_over) job_handle = _index(request, file_format, destination, query=notebook['uuid']) if live_indexing: return redirect( reverse('search:browse', kwargs={'name': destination})) else: return redirect( reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']})) else: raise PopupException(_('Action %s is unknown') % action) return render( 'editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{ "name": "%s SQL" % editor_type.title(), "type": editor_type }], 'mode': 'editor', 'editor_type': editor_type, 'success_url': success_url }), 'editor_type': editor_type, })
def execute_and_watch(request): notebook_id = request.GET.get('editor', request.GET.get('notebook')) snippet_id = int(request.GET['snippet']) action = request.GET['action'] destination = request.GET['destination'] notebook = Notebook(document=Document2.objects.get( id=notebook_id)).get_data() snippet = notebook['snippets'][snippet_id] editor_type = snippet['type'] api = get_api(request, snippet) if action == 'save_as_table': sql, success_url = api.export_data_as_table(notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'insert_as_query': sql, success_url = api.export_large_data_to_hdfs( notebook, snippet, destination) editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute', database=snippet['database']) elif action == 'index_query': sql, success_url = api.export_data_as_table(notebook, snippet, destination, is_temporary=True, location='') editor = make_notebook(name='Execute and watch', editor_type=editor_type, statement=sql, status='ready-execute') sample = get_api(request, snippet).fetch_result(notebook, snippet, 0, start_over=True) from indexer.api3 import _index # Will ve moved to the lib in next commit from indexer.file_format import HiveFormat from indexer.fields import Field file_format = { 'name': 'col', 'inputFormat': 'query', 'format': { 'quoteChar': '"', 'recordSeparator': '\n', 'type': 'csv', 'hasHeader': False, 'fieldSeparator': '\u0001' }, "sample": '', "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample['meta'] ] } job_handle = _index(request, file_format, destination, query=notebook['uuid']) return redirect( reverse('oozie:list_oozie_workflow', kwargs={'job_id': job_handle['handle']['id']})) else: raise PopupException(_('Action %s is unknown') % action) return render( 'editor.mako', request, { 'notebooks_json': json.dumps([editor.get_data()]), 'options_json': json.dumps({ 'languages': [{ "name": "%s SQL" % editor_type.title(), "type": editor_type }], 'mode': 'editor', 'editor_type': editor_type, 'success_url': success_url }), 'editor_type': editor_type, })
def _get_document_helper(request, uuid, with_data, with_dependencies, path): if uuid: if uuid.isdigit(): document = Document2.objects.document(user=request.user, doc_id=uuid) else: document = Document2.objects.get_by_uuid(user=request.user, uuid=uuid) else: # Find by path document = Document2.objects.get_by_path(user=request.user, path=path) response = { 'document': document.to_dict(), 'parent': document.parent_directory.to_dict() if document.parent_directory else None, 'children': [], 'dependencies': [], 'dependents': [], 'data': '', 'status': 0 } response['user_perms'] = { 'can_read': document.can_read(request.user), 'can_write': document.can_write(request.user) } if with_data: data = json.loads(document.data) # Upgrade session properties for Hive and Impala if document.type.startswith('query'): from notebook.models import upgrade_session_properties notebook = Notebook(document=document) notebook = upgrade_session_properties(request, notebook) data = json.loads(notebook.data) if document.type == 'query-pig': # Import correctly from before Hue 4.0 properties = data['snippets'][0]['properties'] if 'hadoopProperties' not in properties: properties['hadoopProperties'] = [] if 'parameters' not in properties: properties['parameters'] = [] if 'resources' not in properties: properties['resources'] = [] if data.get('uuid') != document.uuid: # Old format < 3.11 data['uuid'] = document.uuid response['data'] = data if with_dependencies: response['dependencies'] = [ dependency.to_dict() for dependency in document.dependencies.all() ] response['dependents'] = [ dependent.to_dict() for dependent in document.dependents.exclude( is_history=True).all() ] # Get children documents if this is a directory if document.is_directory: directory = Directory.objects.get(id=document.id) # If this is the user's home directory, fetch shared docs too if document.is_home_directory: children = directory.get_children_and_shared_documents( user=request.user) response.update( _filter_documents(request, queryset=children, flatten=True)) else: children = directory.get_children_documents() response.update( _filter_documents(request, queryset=children, flatten=False)) # Paginate and serialize Results if 'documents' in response: response.update(_paginate(request, queryset=response['documents'])) # Rename documents to children response['children'] = response.pop('documents') response['children'] = [doc.to_dict() for doc in response['children']] return response
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None, is_saved=False, database='default'): from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() properties = HS2Api.get_properties(editor_type) if editor_type == 'hive': if files is not None: _update_property_value(properties, 'files', files) if functions is not None: _update_property_value(properties, 'functions', functions) if settings is not None: _update_property_value(properties, 'settings', settings) elif editor_type == 'impala': if settings is not None: _update_property_value(properties, 'files', files) editor.data = json.dumps({ 'name': name, 'description': description, 'sessions': [{ 'type': editor_type, 'properties': properties, 'id': None }], 'selectedSnippet': editor_type, 'type': 'query-%s' % editor_type, 'showHistory': True, 'isSaved': is_saved, 'snippets': [{ 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': database, 'result': {} }] }) return editor
def _get_notebook(user, notebook, operation_id):
  if operation_id and not notebook:
    nb_doc = Document2.objects.get_by_uuid(user=user, uuid=operation_id)
    notebook = Notebook(document=nb_doc).get_data()
  return notebook
def test_delete_notebook(self): trash_notebook_json = """ { "selectedSnippet": "hive", "showHistory": false, "description": "Test Hive Query", "name": "Test Hive Query", "sessions": [ { "type": "hive", "properties": [], "id": null } ], "type": "query-hive", "id": null, "snippets": [{"id": "e069ef32-5c95-4507-b961-e79c090b5abf","type":"hive","status":"ready","database":"default",""" \ """"statement":"select * from web_logs","statement_raw":"select * from web_logs","variables":[],"properties":""" \ """{"settings":[],"files":[],"functions":[]},"result":{}}], "uuid": "8a20da5f-b69c-4843-b17d-dea5c74c41d1" } """ # Assert that the notebook is first saved response = self.client.post(reverse('notebook:save_notebook'), {'notebook': trash_notebook_json}) data = json.loads(response.content) assert_equal(0, data['status'], data) # Test that deleting it moves it to the user's Trash folder notebook_doc = Document2.objects.get(id=data['id']) trash_notebooks = [Notebook(notebook_doc).get_data()] response = self.client.post(reverse('notebook:delete'), {'notebooks': json.dumps(trash_notebooks)}) data = json.loads(response.content) assert_equal(0, data['status'], data) assert_equal('Trashed 1 notebook(s)', data['message'], data) response = self.client.get('/desktop/api2/doc', {'path': '/.Trash'}) data = json.loads(response.content) trash_uuids = [doc['uuid'] for doc in data['children']] assert_true(notebook_doc.uuid in trash_uuids, data) # Test that any errors are reported in the response nonexistant_doc = { "id": 12345, "uuid": "ea22da5f-b69c-4843-b17d-dea5c74c41d1", "selectedSnippet": "hive", "showHistory": False, "description": "Test Hive Query", "name": "Test Hive Query", "sessions": [{ "type": "hive", "properties": [], "id": None, }], "type": "query-hive", "snippets": [{ "id": "e069ef32-5c95-4507-b961-e79c090b5abf", "type": "hive", "status": "ready", "database": "default", "statement": "select * from web_logs", "statement_raw": "select * from web_logs", "variables": [], "properties": { "settings": [], "files": [], "functions": [] }, "result": {} }] } trash_notebooks = [nonexistant_doc] response = self.client.post(reverse('notebook:delete'), {'notebooks': json.dumps(trash_notebooks)}) data = json.loads(response.content) assert_equal(0, data['status'], data) assert_equal( 'Trashed 0 notebook(s) and failed to delete 1 notebook(s).', data['message'], data) assert_equal(['ea22da5f-b69c-4843-b17d-dea5c74c41d1'], data['errors'])
def _small_indexing(user, fs, client, source, destination, index_name): unique_key_field = destination['indexerPrimaryKey'] and destination[ 'indexerPrimaryKey'][0] or None df = destination['indexerDefaultField'] and destination[ 'indexerDefaultField'][0] or None kwargs = {} errors = [] if source['inputFormat'] not in ('manual', 'table', 'query_handle'): stats = fs.stats(source["path"]) if stats.size > MAX_UPLOAD_SIZE: raise PopupException(_('File size is too large to handle!')) indexer = MorphlineIndexer(user, fs) fields = indexer.get_field_list(destination['columns']) skip_fields = [field['name'] for field in fields if not field['keep']] kwargs['fieldnames'] = ','.join([field['name'] for field in fields]) for field in fields: for operation in field['operations']: if operation['type'] == 'split': field[ 'multiValued'] = True # Solr requires multiValued to be set when splitting kwargs['f.%(name)s.split' % field] = 'true' kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ',' if skip_fields: kwargs['skip'] = ','.join(skip_fields) fields = [ field for field in fields if field['name'] not in skip_fields ] if not unique_key_field: unique_key_field = 'hue_id' fields += [{"name": unique_key_field, "type": "string"}] kwargs['rowid'] = unique_key_field if not destination['hasHeader']: kwargs['header'] = 'false' else: kwargs['skipLines'] = 1 if not client.exists(index_name): client.create_index( name=index_name, config_name=destination.get('indexerConfigSet'), fields=fields, unique_key_field=unique_key_field, df=df, shards=destination['indexerNumShards'], replication=destination['indexerReplicationFactor']) if source['inputFormat'] == 'file': data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE) if client.is_solr_six_or_more(): kwargs['processor'] = 'tolerant' kwargs['map'] = 'NULL:' try: if source['inputFormat'] == 'query': query_id = source['query']['id'] if source['query'].get( 'id') else source['query'] notebook = Notebook(document=Document2.objects.document( user=user, doc_id=query_id)).get_data() request = MockedDjangoRequest(user=user) snippet = notebook['snippets'][0] searcher = CollectionManagerController(user) columns = [ field['name'] for field in fields if field['name'] != 'hue_id' ] fetch_handle = lambda rows, start_over: get_api( request, snippet).fetch_result( notebook, snippet, rows=rows, start_over=start_over ) # Assumes handle still live rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs) # TODO if rows == MAX_ROWS truncation warning else: response = client.index(name=index_name, data=data, **kwargs) errors = [ error.get('message', '') for error in response['responseHeader'].get('errors', []) ] except Exception, e: try: client.delete_index(index_name, keep_config=False) except Exception, e2: LOG.warn( 'Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
def _get_query(self, name):
  nb_doc = Document2.objects.document(user=self.user, doc_id=name)
  notebook = Notebook(document=nb_doc).get_data()
  snippet = notebook['snippets'][0]
  return snippet['statement'].strip(';')
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib.unquote(file_format["path"]) stream = request.fs.open(path) encoding = chardet.detect(stream.read(10000)).get('encoding') stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception, e: LOG.warn( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, }
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = Indexer(request.user, request.fs) stream = request.fs.open(file_format["path"]) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format[ 'inputFormat'] == 'query': # Only support open query history # TODO get schema from explain query, which is not possible notebook = Notebook(document=Document2.objects.get( id=file_format['query'])).get_data() snippet = notebook['snippets'][0] sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True) format_ = { "sample": sample['rows'][:4], "sample_cols": sample.meta, "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in sample.meta ] } return JsonResponse(format_)
def get_history(request): response = {'status': -1} doc_type = request.GET.get('doc_type') doc_text = request.GET.get('doc_text') connector_id = request.GET.get('doc_connector') page = min(int(request.GET.get('page', 1)), 100) limit = min(int(request.GET.get('limit', 50)), 100) is_notification_manager = request.GET.get('is_notification_manager', 'false') == 'true' if is_notification_manager: docs = Document2.objects.get_tasks_history(user=request.user) else: docs = Document2.objects.get_history(doc_type='query-%s' % doc_type, connector_id=connector_id, user=request.user) if doc_text: docs = docs.filter( Q(name__icontains=doc_text) | Q(description__icontains=doc_text) | Q(search__icontains=doc_text)) # Paginate docs = docs.order_by('-last_modified') response['count'] = docs.count() docs = __paginate(page, limit, queryset=docs)['documents'] history = [] for doc in docs: notebook = Notebook(document=doc).get_data() if 'snippets' in notebook: statement = notebook[ 'description'] if is_notification_manager else _get_statement( notebook) history.append({ 'name': doc.name, 'id': doc.id, 'uuid': doc.uuid, 'type': doc.type, 'data': { 'statement': statement[:1001] if statement else '', 'lastExecuted': notebook['snippets'][0].get('lastExecuted', -1), 'status': notebook['snippets'][0]['status'], 'parentSavedQueryUuid': notebook.get('parentSavedQueryUuid', '') } if notebook['snippets'] else {}, 'absoluteUrl': doc.get_absolute_url(), }) else: LOG.error('Incomplete History Notebook: %s' % notebook) response['history'] = sorted(history, key=lambda row: row['data']['lastExecuted'], reverse=True) response['message'] = _('History fetched') response['status'] = 0 return JsonResponse(response)
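# Illustrative only (not part of the original source): the approximate shape of one
# entry in response['history'] built by get_history() above, with made-up values.
example_history_entry = {
    'name': 'My saved query',
    'id': 1000,
    'uuid': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
    'type': 'query-hive',
    'data': {
        'statement': 'SELECT 1',         # truncated to the first 1001 characters
        'lastExecuted': 1570000000000,   # value taken from the snippet, assumed to be epoch milliseconds
        'status': 'available',
        'parentSavedQueryUuid': ''
    },
    'absoluteUrl': '/editor?editor=1000'  # doc.get_absolute_url(); exact format depends on the document type
}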
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None, is_saved=False, database='default', snippet_properties=None, batch_submit=False, on_success_url=None, skip_historify=False, is_task=False, last_executed=-1, is_notebook=False, pub_sub_url=None, result_properties={}, namespace=None, compute=None, is_presentation_mode=False): ''' skip_historify: do not add the task to the query history. e.g. SQL Dashboard is_task / isManaged: true when being a managed by Hue operation (include_managed=True in document), e.g. exporting query result, dropping some tables ''' from notebook.connectors.hiveserver2 import HS2Api if has_connectors(): interpreter = get_interpreter(connector_type=editor_type) editor_connector = editor_type editor_type = interpreter['dialect'] else: editor_connector = editor_type editor = Notebook() if snippet_properties is None: snippet_properties = {} if editor_type == 'hive': sessions_properties = HS2Api.get_properties(editor_type) if files is not None: _update_property_value(sessions_properties, 'files', files) if functions is not None: _update_property_value(sessions_properties, 'functions', functions) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'impala': sessions_properties = HS2Api.get_properties(editor_type) if settings is not None: _update_property_value(sessions_properties, 'files', files) elif editor_type == 'java': sessions_properties = [] # Java options else: sessions_properties = [] data = { 'name': name, 'uuid': str(uuid.uuid4()), 'description': description, 'sessions': [{ 'type': editor_connector, 'properties': sessions_properties, 'id': None }], 'selectedSnippet': editor_connector, # TODO: might need update in notebook.ko.js 'type': 'notebook' if is_notebook else 'query-%s' % editor_type, 'showHistory': True, 'isSaved': is_saved, 'onSuccessUrl': urllib_quote(on_success_url.encode('utf-8'), safe=SAFE_CHARACTERS_URI) if on_success_url else None, 'pubSubUrl': pub_sub_url, 'skipHistorify': skip_historify, 'isPresentationModeDefault': is_presentation_mode, 'isManaged': is_task, 'snippets': [{ 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_connector, 'wasBatchExecuted': batch_submit, 'lastExecuted': last_executed, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': database, 'namespace': namespace if namespace else {}, 'compute': compute if compute else {}, 'result': { 'handle': {} }, 'variables': [] }] if not is_notebook else [] } if snippet_properties: data['snippets'][0]['properties'].update(snippet_properties) if result_properties: data['snippets'][0]['result'].update(result_properties) editor.data = json.dumps(data) return editor
def get_default(self, user, name, engine='solr', source='data'): fields = self.fields_data(user, name, engine, source=source) id_field = [field['name'] for field in fields if field.get('isId')] if id_field: id_field = id_field[0] else: id_field = '' # Schemaless might not have an id if source == 'query': nb_doc = Document2.objects.document(user=user, doc_id=name) notebook = Notebook(document=nb_doc).get_data() label = _get_snippet_name(notebook, unique=True) else: label = name TEMPLATE = { "extracode": escape( "<style type=\"text/css\">\nem {\n font-weight: bold;\n background-color: yellow;\n}</style>\n\n<script>\n</script>" ), "highlighting": [""], "properties": { "highlighting_enabled": True }, "template": """ <div class="row-fluid"> <div class="row-fluid"> <div class="span12">%s</div> </div> <br/> </div>""" % ' '.join(['{{%s}}' % field['name'] for field in fields]), "isGridLayout": True, "showFieldList": True, "showGrid": True, "showChart": False, "chartSettings": { 'chartType': 'bars', 'chartSorting': 'none', 'chartScatterGroup': None, 'chartScatterSize': None, 'chartScope': 'world', 'chartX': None, 'chartYSingle': None, 'chartYMulti': [], 'chartData': [], 'chartMapLabel': None, }, "fieldsAttributes": [self._make_gridlayout_header_field(field) for field in fields], "fieldsSelected": [], "leafletmap": { 'latitudeField': None, 'longitudeField': None, 'labelField': None }, "rows": 25, } FACETS = [] return { 'id': None, 'name': name, 'engine': engine, 'source': source, 'label': label, 'enabled': False, 'template': TEMPLATE, 'facets': FACETS, 'fields': fields, 'idField': id_field, }
def _execute_notebook(request, notebook, snippet): response = {'status': -1} result = None history = None active_executable = None historify = (notebook['type'] != 'notebook' or snippet.get('wasBatchExecuted') ) and not notebook.get('skipHistorify') try: try: sessions = notebook.get('sessions') and notebook[ 'sessions'] # Session reference for snippet execution without persisting it active_executable = json.loads(request.POST.get( 'executable', '{}')) # Editor v2 # TODO: Use statement, database etc. from active_executable if historify: history = _historify(notebook, request.user) notebook = Notebook(document=history).get_data() interpreter = get_api(request, snippet) if snippet.get('interface') == 'sqlalchemy': interpreter.options['session'] = sessions[0] with opentracing.tracer.start_span('interpreter') as span: # interpreter.execute needs the sessions, but we don't want to persist them pre_execute_sessions = notebook['sessions'] notebook['sessions'] = sessions response['handle'] = interpreter.execute(notebook, snippet) notebook['sessions'] = pre_execute_sessions # Retrieve and remove the result from the handle if response['handle'].get('sync'): result = response['handle'].pop('result') finally: if historify: _snippet = [ s for s in notebook['snippets'] if s['id'] == snippet['id'] ][0] if 'id' in active_executable: # Editor v2 # notebook_executable is the 1-to-1 match of active_executable in the notebook structure notebook_executable = [ e for e in _snippet['executor']['executables'] if e['id'] == active_executable['id'] ][0] if 'handle' in response: notebook_executable['handle'] = response['handle'] if history: notebook_executable['history'] = { 'id': history.id, 'uuid': history.uuid } notebook_executable['operationId'] = history.uuid if 'handle' in response: # No failure if 'result' not in _snippet: # Editor v2 _snippet['result'] = {} _snippet['result']['handle'] = response['handle'] _snippet['result']['statements_count'] = response[ 'handle'].get('statements_count', 1) _snippet['result']['statement_id'] = response[ 'handle'].get('statement_id', 0) _snippet['result']['handle']['statement'] = response[ 'handle'].get('statement', snippet['statement']).strip( ) # For non HS2, as non multi query yet else: _snippet['status'] = 'failed' if history: # If _historify failed, history will be None. # If we get Atomic block exception, something underneath interpreter.execute() crashed and is not handled. history.update_data(notebook) history.save() response['history_id'] = history.id response['history_uuid'] = history.uuid if notebook[ 'isSaved']: # Keep track of history of saved queries response[ 'history_parent_uuid'] = history.dependencies.filter( type__startswith='query-').latest( 'last_modified').uuid except QueryError as ex: # We inject the history information from _historify() to the failed queries if response.get('history_id'): ex.extra['history_id'] = response['history_id'] if response.get('history_uuid'): ex.extra['history_uuid'] = response['history_uuid'] if response.get('history_parent_uuid'): ex.extra['history_parent_uuid'] = response['history_parent_uuid'] raise ex # Inject and HTML escape results if result is not None: response['result'] = result response['result']['data'] = escape_rows(result['data']) response['status'] = 0 return response
def _execute_notebook(request, notebook, snippet): response = {'status': -1} result = None history = None historify = (notebook['type'] != 'notebook' or snippet.get('wasBatchExecuted') ) and not notebook.get('skipHistorify') try: try: session = notebook.get('sessions') and notebook['sessions'][ 0] # Session reference for snippet execution without persisting it if historify: history = _historify(notebook, request.user) notebook = Notebook(document=history).get_data() interpreter = get_api(request, snippet) if snippet.get('interface') == 'sqlalchemy': interpreter.options['session'] = session response['handle'] = interpreter.execute(notebook, snippet) # Retrieve and remove the result from the handle if response['handle'].get('sync'): result = response['handle'].pop('result') finally: if historify: _snippet = [ s for s in notebook['snippets'] if s['id'] == snippet['id'] ][0] if 'handle' in response: # No failure _snippet['result']['handle'] = response['handle'] _snippet['result']['statements_count'] = response[ 'handle'].get('statements_count', 1) _snippet['result']['statement_id'] = response[ 'handle'].get('statement_id', 0) _snippet['result']['handle']['statement'] = response[ 'handle'].get('statement', snippet['statement']).strip( ) # For non HS2, as non multi query yet else: _snippet['status'] = 'failed' if history: # If _historify failed, history will be None history.update_data(notebook) history.save() response['history_id'] = history.id response['history_uuid'] = history.uuid if notebook[ 'isSaved']: # Keep track of history of saved queries response[ 'history_parent_uuid'] = history.dependencies.filter( type__startswith='query-').latest( 'last_modified').uuid except QueryError, ex: # We inject the history information from _historify() to the failed queries if response.get('history_id'): ex.extra['history_id'] = response['history_id'] if response.get('history_uuid'): ex.extra['history_uuid'] = response['history_uuid'] if response.get('history_parent_uuid'): ex.extra['history_parent_uuid'] = response['history_parent_uuid'] raise ex
def _get_snippet(user, notebook, snippet, operation_id): if operation_id or not snippet: nb_doc = Document2.objects.get_by_uuid(user=user, uuid=operation_id or notebook['uuid']) notebook = Notebook(document=nb_doc).get_data() snippet = notebook['snippets'][0] return snippet
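# Illustrative only: how a follow-up API call (e.g. check_status or fetch_result)
# could resolve the live snippet with _get_snippet() when the client sends an
# operation id instead of a full snippet payload. The 'operationId' POST key and
# the helper name _resolve_snippet are assumptions for this sketch.
import json


def _resolve_snippet(request):
    notebook = json.loads(request.POST.get('notebook', '{}'))
    snippet = json.loads(request.POST.get('snippet', '{}'))
    operation_id = request.POST.get('operationId')

    # Falls back to the first snippet of the persisted notebook/operation document.
    return _get_snippet(request.user, notebook, snippet, operation_id)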
def export_documents(request): if request.GET.get('documents'): selection = json.loads(request.GET.get('documents')) else: selection = json.loads(request.POST.get('documents')) include_history = request.GET.get('history', 'false') == 'true' # Only export documents the user has permissions to read docs = Document2.objects.documents(user=request.user, perms='both', include_history=True, include_trashed=True).filter(id__in=selection).order_by('-id') # Add any dependencies to the set of exported documents export_doc_set = _get_dependencies(docs, include_history=include_history) # For directories, add any children docs to the set of exported documents export_doc_set.update(_get_dependencies(docs, deps_mode=False)) # Get PKs of documents to export doc_ids = [doc.pk for doc in export_doc_set] num_docs = len(doc_ids) if len(selection) == 1 and num_docs >= len(selection) and docs[0].name: filename = docs[0].name else: filename = 'hue-documents-%s-(%s)' % (datetime.today().strftime('%Y-%m-%d'), num_docs) f = string_io() if doc_ids: doc_ids = ','.join(map(str, doc_ids)) management.call_command('dumpdata', 'desktop.Document2', primary_keys=doc_ids, indent=2, use_natural_foreign_keys=True, verbosity=2, stdout=f) if request.GET.get('format') == 'json': return JsonResponse(f.getvalue(), safe=False) elif request.GET.get('format') == 'zip': zfile = zipfile.ZipFile(f, 'w') zfile.writestr("hue.json", f.getvalue()) for doc in docs: if doc.type == 'notebook': try: from spark.models import Notebook zfile.writestr("notebook-%s-%s.txt" % (doc.name, doc.id), smart_str(Notebook(document=doc).get_str())) except Exception as e: LOG.exception(e) zfile.close() response = HttpResponse(content_type="application/zip") response["Content-Length"] = len(f.getvalue()) response['Content-Disposition'] = 'attachment; filename="%s.zip"' % filename response.write(f.getvalue()) return response else: return make_response(f.getvalue(), 'json', filename)
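# Hypothetical client-side call to the export endpoint above. The mount point
# '/desktop/api2/doc/export' and the cookie-based auth are assumptions about the
# deployment, not part of the view itself.
import json

import requests

resp = requests.get(
    'https://hue.example.com/desktop/api2/doc/export',
    params={'documents': json.dumps([1000, 1001]), 'format': 'zip'},
    cookies={'sessionid': '...'},  # an authenticated Hue session
)
with open('hue-documents.zip', 'wb') as fh:
    fh.write(resp.content)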
def get_document(request): """ Returns the document or directory found for the given uuid or path and current user. If a directory is found, return any children documents too. Optional params: page=<n> - Controls pagination. Defaults to 1. limit=<n> - Controls limit per page. Defaults to all. type=<type> - Show documents of given type(s) (directory, query-hive, query-impala, query-mysql, etc). Default to all. sort=<key> - Sort by the attribute <key>, which is one of: "name", "type", "owner", "last_modified" Accepts the form "-last_modified", which sorts in descending order. Default to "-last_modified". text=<frag> - Search for fragment "frag" in names and descriptions. data=<false|true> - Return all the data of the document. Default to false. dependencies=<false|true> - Return all the dependencies and dependents of the document. Default to false. """ path = request.GET.get('path', '/') uuid = request.GET.get('uuid') with_data = request.GET.get('data', 'false').lower() == 'true' with_dependencies = request.GET.get('dependencies', 'false').lower() == 'true' if uuid: if uuid.isdigit(): document = Document2.objects.document(user=request.user, doc_id=uuid) else: document = Document2.objects.get_by_uuid(user=request.user, uuid=uuid) else: # Find by path document = Document2.objects.get_by_path(user=request.user, path=path) response = { 'document': document.to_dict(), 'parent': document.parent_directory.to_dict() if document.parent_directory else None, 'children': [], 'dependencies': [], 'dependents': [], 'data': '', 'status': 0 } response['user_perms'] = { 'can_read': document.can_read(request.user), 'can_write': document.can_write(request.user) } if with_data: data = json.loads(document.data) # Upgrade session properties for Hive and Impala if document.type.startswith('query'): notebook = Notebook(document=document) notebook = upgrade_session_properties(request, notebook) data = json.loads(notebook.data) if data.get('uuid') != document.uuid: # Old format < 3.11 data['uuid'] = document.uuid response['data'] = data if with_dependencies: response['dependencies'] = [dependency.to_dict() for dependency in document.dependencies.all()] response['dependents'] = [dependent.to_dict() for dependent in document.dependents.all()] # Get children documents if this is a directory if document.is_directory: directory = Directory.objects.get(id=document.id) # If this is the user's home directory, fetch shared docs too if document.is_home_directory: children = directory.get_children_and_shared_documents(user=request.user) else: children = directory.get_children_documents() # Filter and order results response.update(_filter_documents(request, queryset=children, flatten=False)) # Paginate and serialize Results if 'documents' in response: response.update(_paginate(request, queryset=response['documents'])) # Rename documents to children response['children'] = response.pop('documents') response['children'] = [doc.to_dict() for doc in response['children']] return JsonResponse(response)
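# Illustrative request exercising the parameters documented in the docstring
# above; the '/desktop/api2/doc/get' path and cookie auth are assumptions about
# where the view is mounted.
import requests

resp = requests.get(
    'https://hue.example.com/desktop/api2/doc/get',
    params={
        'path': '/',               # or uuid=<uuid> / uuid=<numeric doc id>
        'type': 'query-hive',
        'sort': '-last_modified',
        'text': 'customers',
        'page': 1,
        'limit': 25,
        'data': 'false',
        'dependencies': 'false',
    },
    cookies={'sessionid': '...'},
)
children = resp.json()['children']  # populated when the target is a directory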
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None, is_saved=False, database='default', snippet_properties=None, batch_submit=False, on_success_url=None): from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() if snippet_properties is None: snippet_properties = {} if editor_type == 'hive': sessions_properties = HS2Api.get_properties(editor_type) if files is not None: _update_property_value(sessions_properties, 'files', files) if functions is not None: _update_property_value(sessions_properties, 'functions', functions) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'impala': sessions_properties = HS2Api.get_properties(editor_type) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'java': sessions_properties = [] # Java options else: sessions_properties = [] data = { 'name': name, 'uuid': str(uuid.uuid4()), 'description': description, 'sessions': [ { 'type': editor_type, 'properties': sessions_properties, 'id': None } ], 'selectedSnippet': editor_type, 'type': 'query-%s' % editor_type, 'showHistory': True, 'isSaved': is_saved, 'onSuccessUrl': on_success_url, 'snippets': [ { 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'wasBatchExecuted': batch_submit, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': database, 'result': {'handle': {}}, 'variables': [] } ] } if snippet_properties: data['snippets'][0]['properties'].update(snippet_properties) editor.data = json.dumps(data) return editor
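# A minimal sketch of building a throwaway Hive query with make_notebook() and
# submitting it through _execute_notebook() shown earlier. In the real codebase
# these helpers live in different modules, so an import would be needed; the
# function name and the sample statement are placeholders.
import json


def _browse_web_logs(request):
    editor = make_notebook(
        name='Browse',
        editor_type='hive',
        statement='SELECT * FROM web_logs LIMIT 100',
        status='ready',
        database='default',
        is_saved=False,
    )
    notebook_data = json.loads(editor.data)
    # Executes the single generated snippet without saving the document.
    return _execute_notebook(request, notebook_data, notebook_data['snippets'][0])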
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib_unquote(file_format["path"]) stream = request.fs.open(path) encoding = chardet.detect(stream.read(10000)).get('encoding') stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warn( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, } elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': if file_format.get( 'kafkaSelectedTopics') == 'NavigatorAuditEvents': kafkaFieldNames = [ 'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip', 'name', 'objectType', 'objType', 'objUsageType', 'operationParams', 'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier', 'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src', 'status', 'subOperation', 'tableName', 'table', 'time', 'type', 'url', 'user' ] kafkaFieldTypes = ['string'] * len(kafkaFieldNames) kafkaFieldNames.append('timeDate') kafkaFieldTypes.append('date') else: # Note: mocked here, should come from SFDC or Kafka API or sampling job kafkaFieldNames = file_format.get('kafkaFieldNames', '').split(',') kafkaFieldTypes = file_format.get('kafkaFieldTypes', '').split(',') data = 
"""%(kafkaFieldNames)s %(data)s""" % { 'kafkaFieldNames': ','.join(kafkaFieldNames), 'data': '\n'.join( [','.join(['...'] * len(kafkaFieldTypes))] * 5) } stream = string_io() stream.write(data) _convert_format(file_format["format"], inverse=True) indexer = MorphlineIndexer(request.user, request.fs) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes))) for col in format_['columns']: col['keyType'] = type_mapping[col['name']] col['type'] = type_mapping[col['name']] elif file_format['streamSelection'] == 'flume': if 'hue-httpd/access_log' in file_format['channelSourcePath']: columns = [{ 'name': 'id', 'type': 'string', 'unique': True }, { 'name': 'client_ip', 'type': 'string' }, { 'name': 'time', 'type': 'date' }, { 'name': 'request', 'type': 'string' }, { 'name': 'code', 'type': 'plong' }, { 'name': 'bytes', 'type': 'plong' }, { 'name': 'method', 'type': 'string' }, { 'name': 'url', 'type': 'string' }, { 'name': 'protocol', 'type': 'string' }, { 'name': 'app', 'type': 'string' }, { 'name': 'subapp', 'type': 'string' }] else: columns = [{'name': 'message', 'type': 'string'}] format_ = { "sample": [['...'] * len(columns)] * 4, "columns": [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string'), unique=col.get('unique')).to_dict() for col in columns ] } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) table_metadata = [{ 'name': column['name'], 'type': column['type'] } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']] query = 'SELECT %s FROM %s LIMIT 4' % (', '.join( [col['name'] for col in table_metadata]), file_format['streamObject']) print(query) try: records = sf.query_all(query) except SalesforceRefusedRequest as e: raise PopupException(message=str(e)) format_ = { "sample": [list(row.values())[1:] for row in records['records']], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in table_metadata ] } else: raise PopupException( _('Connector format not recognized: %(connectorSelection)s') % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) return JsonResponse(format_)
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) stream = request.fs.open(file_format["path"]) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": file_format['path'] }, "format": file_format['format'] }) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query'] notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warn('Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, }
def make_notebook(name='Browse', description='', editor_type='hive', statement='', status='ready', files=None, functions=None, settings=None, is_saved=False, database='default', snippet_properties=None, batch_submit=False): from notebook.connectors.hiveserver2 import HS2Api editor = Notebook() if snippet_properties is None: snippet_properties = {} if editor_type == 'hive': sessions_properties = HS2Api.get_properties(editor_type) if files is not None: _update_property_value(sessions_properties, 'files', files) if functions is not None: _update_property_value(sessions_properties, 'functions', functions) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'impala': sessions_properties = HS2Api.get_properties(editor_type) if settings is not None: _update_property_value(sessions_properties, 'settings', settings) elif editor_type == 'java': sessions_properties = [] # Java options else: sessions_properties = [] data = { 'name': name, 'uuid': str(uuid.uuid4()), 'description': description, 'sessions': [{ 'type': editor_type, 'properties': sessions_properties, 'id': None }], 'selectedSnippet': editor_type, 'type': 'query-%s' % editor_type, 'showHistory': True, 'isSaved': is_saved, 'snippets': [{ 'status': status, 'id': str(uuid.uuid4()), 'statement_raw': statement, 'statement': statement, 'type': editor_type, 'wasBatchExecuted': batch_submit, 'properties': { 'files': [] if files is None else files, 'functions': [] if functions is None else functions, 'settings': [] if settings is None else settings }, 'name': name, 'database': database, 'result': {}, 'variables': [] }] } if snippet_properties: data['snippets'][0]['properties'].update(snippet_properties) editor.data = json.dumps(data) return editor
def _small_indexing(user, fs, client, source, destination, index_name): kwargs = {} errors = [] if source['inputFormat'] not in ('manual', 'table', 'query_handle'): path = urllib_unquote(source["path"]) stats = fs.stats(path) if stats.size > MAX_UPLOAD_SIZE: raise PopupException(_('File size is too large to handle!')) indexer = MorphlineIndexer(user, fs) fields = indexer.get_field_list(destination['columns']) _create_solr_collection(user, fs, client, destination, index_name, kwargs) if source['inputFormat'] == 'file': path = urllib_unquote(source["path"]) data = fs.read(path, 0, MAX_UPLOAD_SIZE) if client.is_solr_six_or_more(): kwargs['processor'] = 'tolerant' kwargs['map'] = 'NULL:' try: if source['inputFormat'] == 'query': query_id = source['query']['id'] if source['query'].get('id') else source['query'] notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data() request = MockedDjangoRequest(user=user) snippet = notebook['snippets'][0] searcher = CollectionManagerController(user) columns = [field['name'] for field in fields if field['name'] != 'hue_id'] fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over) # Assumes handle still live rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs) # TODO if rows == MAX_ROWS truncation warning elif source['inputFormat'] == 'manual': pass # No need to do anything else: response = client.index(name=index_name, data=data, **kwargs) errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])] except Exception as e: try: client.delete_index(index_name, keep_config=False) except Exception as e2: LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
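# Hypothetical invocation of _small_indexing() for a small CSV already in HDFS.
# The SolrClient import/constructor, the column specs and the index name are
# assumptions for illustration; files above MAX_UPLOAD_SIZE are rejected by the
# helper itself.
from indexer.solr_client import SolrClient


def _index_small_csv(request):
    client = SolrClient(user=request.user)
    source = {'inputFormat': 'file', 'path': '/user/demo/web_logs.csv'}
    destination = {
        'columns': [
            {'name': 'id', 'type': 'string'},
            {'name': 'message', 'type': 'text_general'},
        ]
    }
    _small_indexing(request.user, request.fs, client, source, destination, index_name='web_logs_demo')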
def guess_field_types(request): file_format = json.loads(request.POST.get('fileFormat', '{}')) if file_format['inputFormat'] == 'file': indexer = MorphlineIndexer(request.user, request.fs) path = urllib_unquote(file_format["path"]) stream = request.fs.open(path) encoding = check_encoding(stream.read(10000)) stream.seek(0) _convert_format(file_format["format"], inverse=True) format_ = indexer.guess_field_types({ "file": { "stream": stream, "name": path }, "format": file_format['format'] }) # Note: Would also need to set charset to table (only supported in Hive) if 'sample' in format_ and format_['sample']: format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding) for col in format_['columns']: col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding) elif file_format['inputFormat'] == 'table': sample = get_api(request, { 'type': 'hive' }).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName']) db = dbms.get(request.user) table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName']) format_ = { "sample": sample['rows'][:4], "columns": [ Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict() for col in table_metadata.cols ] } elif file_format['inputFormat'] == 'query': query_id = file_format['query']['id'] if file_format['query'].get( 'id') else file_format['query'] notebook = Notebook(document=Document2.objects.document( user=request.user, doc_id=query_id)).get_data() snippet = notebook['snippets'][0] db = get_api(request, snippet) if file_format.get('sampleCols'): columns = file_format.get('sampleCols') sample = file_format.get('sample') else: snippet['query'] = snippet['statement'] try: sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4] except Exception as e: LOG.warn( 'Skipping sample data as query handle might be expired: %s' % e) sample = [[], [], [], [], []] columns = db.autocomplete(snippet=snippet, database='', table='') columns = [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict() for col in columns['extended_columns'] ] format_ = { "sample": sample, "columns": columns, } elif file_format['inputFormat'] == 'rdbms': api = _get_api(request) sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName']) format_ = { "sample": list(sample['rows'])[:4], "columns": [ Field(col['name'], col['type']).to_dict() for col in sample['full_headers'] ] } elif file_format['inputFormat'] == 'stream': if file_format['streamSelection'] == 'kafka': data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics')) kafkaFieldNames = [col['name'] for col in data['full_headers']] kafkaFieldTypes = [col['type'] for col in data['full_headers']] topics_data = data['rows'] format_ = { "sample": topics_data, "columns": [ Field(col, 'string', unique=False).to_dict() for col in kafkaFieldNames ] } # data = """%(kafkaFieldNames)s # %(data)s""" % { # 'kafkaFieldNames': ','.join(kafkaFieldNames), # 'data': '\n'.join([','.join(cols) for cols in topics_data]) # } # stream = string_io() # stream.write(data) # _convert_format(file_format["format"], inverse=True) # indexer = MorphlineIndexer(request.user, request.fs) # format_ = indexer.guess_field_types({ # "file": { # "stream": stream, # "name": file_format['path'] # }, # "format": file_format['format'] # }) # type_mapping = dict( # list( # zip(kafkaFieldNames, kafkaFieldTypes) # ) # ) 
# for col in format_['columns']: # col['keyType'] = type_mapping[col['name']] # col['type'] = type_mapping[col['name']] elif file_format['streamSelection'] == 'flume': if 'hue-httpd/access_log' in file_format['channelSourcePath']: columns = [{ 'name': 'id', 'type': 'string', 'unique': True }, { 'name': 'client_ip', 'type': 'string' }, { 'name': 'time', 'type': 'date' }, { 'name': 'request', 'type': 'string' }, { 'name': 'code', 'type': 'plong' }, { 'name': 'bytes', 'type': 'plong' }, { 'name': 'method', 'type': 'string' }, { 'name': 'url', 'type': 'string' }, { 'name': 'protocol', 'type': 'string' }, { 'name': 'app', 'type': 'string' }, { 'name': 'subapp', 'type': 'string' }] else: columns = [{'name': 'message', 'type': 'string'}] format_ = { "sample": [['...'] * len(columns)] * 4, "columns": [ Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string'), unique=col.get('unique')).to_dict() for col in columns ] } elif file_format['inputFormat'] == 'connector': if file_format['connectorSelection'] == 'sfdc': sf = Salesforce(username=file_format['streamUsername'], password=file_format['streamPassword'], security_token=file_format['streamToken']) table_metadata = [{ 'name': column['name'], 'type': column['type'] } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']] query = 'SELECT %s FROM %s LIMIT 4' % (', '.join( [col['name'] for col in table_metadata]), file_format['streamObject']) print(query) try: records = sf.query_all(query) except SalesforceRefusedRequest as e: raise PopupException(message=str(e)) format_ = { "sample": [list(row.values())[1:] for row in records['records']], "columns": [ Field( col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get( col['type'], 'string')).to_dict() for col in table_metadata ] } else: raise PopupException( _('Connector format not recognized: %(connectorSelection)s') % file_format) else: raise PopupException( _('Input format not recognized: %(inputFormat)s') % file_format) return JsonResponse(format_)
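# Illustrative 'stream' and 'connector' payloads for the branches above
# (topic, credentials and object names are placeholders).
kafka_payload = {
    'inputFormat': 'stream',
    'streamSelection': 'kafka',
    'kafkaSelectedTopics': 'web_logs_topic',
}

sfdc_payload = {
    'inputFormat': 'connector',
    'connectorSelection': 'sfdc',
    'streamUsername': 'user@example.com',
    'streamPassword': '...',
    'streamToken': '...',
    'streamObject': 'Opportunity',
}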