def action_parameters(request):
    response = {'status': -1}
    parameters = set()

    try:
        node_data = json.loads(request.POST.get('node', '{}'))

        parameters = parameters.union(set(Node(node_data).find_parameters()))

        script_path = node_data.get('properties', {}).get('script_path', {})
        if script_path:
            script_path = script_path.replace('hdfs://', '')

            if request.fs.do_as_user(request.user, request.fs.exists, script_path):
                data = request.fs.do_as_user(request.user, request.fs.read, script_path, 0, 16 * 1024 ** 2)

                if node_data['type'] in ('hive', 'hive2'):
                    parameters = parameters.union(set(find_dollar_braced_variables(data)))
                elif node_data['type'] == 'pig':
                    parameters = parameters.union(set(find_dollar_variables(data)))
        elif node_data['type'] == 'hive-document':
            notebook = Notebook(document=Document2.objects.get_by_uuid(user=request.user, uuid=node_data['properties']['uuid']))
            parameters = parameters.union(set(find_dollar_braced_variables(notebook.get_str())))

        response['status'] = 0
        response['parameters'] = list(parameters)
    except Exception as e:
        response['message'] = str(e)

    return JsonResponse(response)
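# For reference, the two helpers used above match different parameter syntaxes.
# A minimal illustrative sketch with regex stand-ins (the real implementations
# live in Hue's Oozie utilities and may differ in edge cases):
import re

def find_dollar_braced_variables_sketch(text):
    """Hive-style parameters: ${param}."""
    return set(re.findall(r'\$\{(\w+)\}', text))

def find_dollar_variables_sketch(text):
    """Pig-style parameters: bare $param, no braces."""
    return set(re.findall(r'\$(?!\{)(\w+)', text))

assert find_dollar_braced_variables_sketch("SELECT * FROM t WHERE d='${date}'") == set(['date'])
assert find_dollar_variables_sketch("data = LOAD '$input';") == set(['input'])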
def execute_and_wait(self, query_doc, snippet_idx=0, timeout=30.0, wait=1.0):
    notebook = Notebook(document=query_doc)
    snippet = self.get_snippet(notebook, snippet_idx=snippet_idx)

    curr = time.time()
    end = curr + timeout
    status = 'ready'

    response = self.client.post(reverse('notebook:execute'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    notebook = Notebook(document=query_doc)
    snippet = self.get_snippet(notebook, snippet_idx=snippet_idx)
    data = json.loads(response.content)
    snippet['result']['handle'] = data['handle']

    while status != 'available' and curr <= end:
        response = self.client.post(reverse('notebook:check_status'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})
        data = json.loads(response.content)
        status = data['query_status']['status']
        snippet['status'] = status
        time.sleep(wait)
        curr = time.time()

    if status != 'available':
        raise Exception('Query failed to complete or return results.')

    return snippet
def test_fetch_result_abbreviated(self):
    if not is_live_cluster():
        raise SkipTest

    # Create session so that session object is saved to DB for server URL lookup
    session = self.api.create_session(lang='impala')

    try:
        # Assert that abbreviated rows returned (e.g. - 1.00K) still returns actual rows
        statement = "SELECT * FROM web_logs;"
        doc = self.create_query_document(owner=self.user, query_type='impala', statement=statement)
        notebook = Notebook(document=doc)
        snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

        self.client.post(reverse('notebook:fetch_result_data'),
                         {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'rows': 100, 'startOver': 'false'})

        response = self.client.post(reverse('notebook:fetch_result_size'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_true('result' in data)
        assert_true('rows' in data['result'])
        assert_equal(1000, data['result']['rows'])
    finally:
        self.api.close_session(session)
def test_get_current_statement(self):
    multi_statement = "SELECT description, salary FROM sample_07 LIMIT 20;\r\nSELECT AVG(salary) FROM sample_07;"

    doc = self.create_query_document(owner=self.user, statement=multi_statement)
    notebook = Notebook(document=doc)
    snippet = self.get_snippet(notebook, snippet_idx=0)

    response = self.client.post(reverse('notebook:execute'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)

    assert_equal(0, data['status'], data)
    assert_equal(0, data['handle']['statement_id'], data)
    assert_equal(2, data['handle']['statements_count'], data)
    assert_equal(True, data['handle']['has_more_statements'], data)
    assert_equal({'row': 0, 'column': 0}, data['handle']['start'], data)
    assert_equal({'row': 0, 'column': 51}, data['handle']['end'], data)

    snippet['result']['handle'] = data['handle']

    response = self.client.post(reverse('notebook:execute'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)

    assert_equal(0, data['status'], data)
    assert_equal(1, data['handle']['statement_id'], data)
    assert_equal(2, data['handle']['statements_count'], data)
    assert_equal(False, data['handle']['has_more_statements'], data)
    assert_equal({'row': 1, 'column': 0}, data['handle']['start'], data)
    assert_equal({'row': 1, 'column': 33}, data['handle']['end'], data)
def test_get_sample(self):
    doc = self.create_query_document(owner=self.user, statement=self.statement)
    notebook = Notebook(document=doc)
    snippet = self.get_snippet(notebook, snippet_idx=0)

    response = self.client.post(reverse('notebook:api_sample_data', kwargs={'database': 'default', 'table': 'sample_07'}),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)

    assert_equal(0, data['status'], data)
    assert_true('headers' in data)
    assert_true('rows' in data)
    assert_true(len(data['rows']) > 0)

    response = self.client.post(reverse('notebook:api_sample_data_column', kwargs={'database': 'default', 'table': 'sample_07', 'column': 'code'}),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)

    assert_equal(0, data['status'], data)
    assert_true('headers' in data)
    assert_equal(['code'], data['headers'])
    assert_true('rows' in data)
    assert_true(len(data['rows']) > 0)
def schedule_document(request):
    if request.method != 'POST':
        raise PopupException(_('A POST request is required.'))

    uuid = request.POST.get('uuid')
    document = Document2.objects.get_by_uuid(user=request.user, uuid=uuid)

    notebook = Notebook(document=document)
    parameters = find_dollar_braced_variables(notebook.get_str())

    name = _('Schedule of ') + document.name
    params = [{u'value': u'%s=${%s}' % (p, p)} for p in parameters]

    data = json.dumps({
        'workflow': {
            u'name': name,
            u'versions': [u'uri:oozie:workflow:0.4', u'uri:oozie:workflow:0.4.5', u'uri:oozie:workflow:0.5'],
            u'isDirty': False,
            u'movedNode': None,
            u'linkMapping': {
                u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a': [],
                u'3f107997-04cc-8733-60a9-a4bb62cebffc': [u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c'],
                u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c': [u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a'],
                u'17c9c895-5a16-7443-bb81-f34b30b21548': []
            },
            u'nodeIds': [u'3f107997-04cc-8733-60a9-a4bb62cebffc', u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a', u'17c9c895-5a16-7443-bb81-f34b30b21548', u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c'],
            u'id': 47,
            u'nodes': [
                {u'name': u'Start', u'properties': {}, u'actionParametersFetched': False, u'id': u'3f107997-04cc-8733-60a9-a4bb62cebffc', u'type': u'start-widget', u'children': [{u'to': u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c'}], u'actionParameters': []},
                {u'name': u'End', u'properties': {}, u'actionParametersFetched': False, u'id': u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a', u'type': u'end-widget', u'children': [], u'actionParameters': []},
                {u'name': u'Kill', u'properties': {u'body': u'', u'cc': u'', u'to': u'', u'enableMail': False, u'message': u'Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]', u'subject': u''}, u'actionParametersFetched': False, u'id': u'17c9c895-5a16-7443-bb81-f34b30b21548', u'type': u'kill-widget', u'children': [], u'actionParameters': []},
                {u'name': u'hive-0aec', u'actionParametersUI': [], u'properties': {u'files': [], u'job_xml': u'', u'uuid': uuid, u'parameters': params, u'retry_interval': [], u'retry_max': [], u'job_properties': [], u'sla': [{u'key': u'enabled', u'value': False}, {u'key': u'nominal-time', u'value': u'${nominal_time}'}, {u'key': u'should-start', u'value': u''}, {u'key': u'should-end', u'value': u'${30 * MINUTES}'}, {u'key': u'max-duration', u'value': u''}, {u'key': u'alert-events', u'value': u''}, {u'key': u'alert-contact', u'value': u''}, {u'key': u'notification-msg', u'value': u''}, {u'key': u'upstream-apps', u'value': u''}], u'archives': [], u'prepares': [], u'credentials': [], u'password': u'', u'jdbc_url': u''}, u'actionParametersFetched': False, u'id': u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c', u'type': u'hive-document-widget', u'children': [{u'to': u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a'}, {u'error': u'17c9c895-5a16-7443-bb81-f34b30b21548'}], u'actionParameters': []}
            ],
            u'properties': {
                u'job_xml': u'',
                u'description': u'',
                u'wf1_id': None,
                u'sla_enabled': False,
                u'deployment_dir': u'/user/hue/oozie/workspaces/hue-oozie-1459474214.27',
                u'schema_version': u'uri:oozie:workflow:0.5',
                u'sla': [{u'key': u'enabled', u'value': False}, {u'key': u'nominal-time', u'value': u'${nominal_time}'}, {u'key': u'should-start', u'value': u''}, {u'key': u'should-end', u'value': u'${30 * MINUTES}'}, {u'key': u'max-duration', u'value': u''}, {u'key': u'alert-events', u'value': u''}, {u'key': u'alert-contact', u'value': u''}, {u'key': u'notification-msg', u'value': u''}, {u'key': u'upstream-apps', u'value': u''}],
                u'show_arrows': True,
                u'parameters': [{u'name': u'oozie.use.system.libpath', u'value': True}],
                u'properties': []
            },
            u'nodeNamesMapping': {
                u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a': u'End',
                u'3f107997-04cc-8733-60a9-a4bb62cebffc': u'Start',
                u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c': u'hive-0aec',
                u'17c9c895-5a16-7443-bb81-f34b30b21548': u'Kill'
            },
            u'uuid': u'433922e5-e616-dfe0-1cba-7fe744c9305c'
        },
        'layout': [{
            u'oozieRows': [
                {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Hive', u'widgetType': u'hive-document-widget', u'oozieMovable': True, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c', u'size': 12}], u'id': u'32e1ea1a-812b-6878-9719-ff7b8407bf46', u'columns': []}
            ],
            u'rows': [
                {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Start', u'widgetType': u'start-widget', u'oozieMovable': False, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'3f107997-04cc-8733-60a9-a4bb62cebffc', u'size': 12}], u'id': u'798dc16a-d366-6305-d2b3-2d5a6f6c4f4b', u'columns': []},
                {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Hive', u'widgetType': u'hive-document-widget', u'oozieMovable': True, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'0aec471d-2b7c-d93d-b22c-2110fd17ea2c', u'size': 12}], u'id': u'32e1ea1a-812b-6878-9719-ff7b8407bf46', u'columns': []},
                {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'End', u'widgetType': u'end-widget', u'oozieMovable': False, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a', u'size': 12}], u'id': u'f2cf152d-8c82-2f4f-5d67-2e18c99e59c4', u'columns': []},
                {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Kill', u'widgetType': u'kill-widget', u'oozieMovable': True, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'17c9c895-5a16-7443-bb81-f34b30b21548', u'size': 12}], u'id': u'01afcf1b-fa7a-e093-b613-ce52c5531a04', u'columns': []}
            ],
            u'oozieEndRow': {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'End', u'widgetType': u'end-widget', u'oozieMovable': False, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'33430f0f-ebfa-c3ec-f237-3e77efa03d0a', u'size': 12}], u'id': u'f2cf152d-8c82-2f4f-5d67-2e18c99e59c4', u'columns': []},
            u'oozieKillRow': {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Kill', u'widgetType': u'kill-widget', u'oozieMovable': True, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'17c9c895-5a16-7443-bb81-f34b30b21548', u'size': 12}], u'id': u'01afcf1b-fa7a-e093-b613-ce52c5531a04', u'columns': []},
            u'enableOozieDropOnAfter': True,
            u'oozieStartRow': {u'enableOozieDropOnBefore': True, u'enableOozieDropOnSide': True, u'enableOozieDrop': False, u'widgets': [{u'status': u'', u'logsURL': u'', u'name': u'Start', u'widgetType': u'start-widget', u'oozieMovable': False, u'ooziePropertiesExpanded': False, u'externalIdUrl': u'', u'properties': {}, u'isLoading': True, u'offset': 0, u'actionURL': u'', u'progress': 0, u'klass': u'card card-widget span12', u'oozieExpanded': False, u'id': u'3f107997-04cc-8733-60a9-a4bb62cebffc', u'size': 12}], u'id': u'798dc16a-d366-6305-d2b3-2d5a6f6c4f4b', u'columns': []},
            u'klass': u'card card-home card-column span12',
            u'enableOozieDropOnBefore': True,
            u'drops': [u'temp'],
            u'id': u'672ff75a-d841-72c3-c616-c9d45ec97649',
            u'size': 12
        }]
    })

    workflow_doc = Document2.objects.create(name=name, type='oozie-workflow2', owner=request.user, data=data)
    Document.objects.link(workflow_doc, owner=workflow_doc.owner, name=workflow_doc.name, description=workflow_doc.description, extra='workflow2')

    workflow_doc.dependencies.add(document)

    response = {
        'status': 0,
        'url': reverse('oozie:new_coordinator') + '?workflow=' + workflow_doc.uuid
    }

    return JsonResponse(response)
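# Illustration of the parameter mapping built in schedule_document() above:
# each ${var} found in the saved query becomes an Oozie action parameter of
# the form var=${var}. The variable names here are made up for the example.
example_parameters = ['date', 'region']
example_params = [{u'value': u'%s=${%s}' % (p, p)} for p in example_parameters]
assert example_params == [{u'value': u'date=${date}'}, {u'value': u'region=${region}'}]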
def run_morphline(self, request, collection_name, morphline, input_path, query=None):
    workspace_path = self._upload_workspace(morphline)

    notebook = Notebook(name='Indexer job for %s' % collection_name, isManaged=True)

    if query:
        q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
        notebook_data = q.get_data()
        snippet = notebook_data['snippets'][0]

        api = get_api(request, snippet)

        destination = '__hue_%s' % notebook_data['uuid'][:4]
        location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
        sql, success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
        input_path = '${nameNode}%s' % location

        notebook.add_hive_snippet(snippet['database'], sql)

    notebook.add_java_snippet(
        clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
        app_jar=CONFIG_INDEXER_LIBS_PATH.get(),
        arguments=[
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            input_path,
        ],
        files=[
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ]
    )

    return notebook.execute(request, batch=True)
def run_morphline(self, request, collection_name, morphline, input_path, query=None, start_time=None, lib_path=None):
    workspace_path = self._upload_workspace(morphline)

    task = make_notebook(
        name=_('Indexing into %s') % collection_name,
        editor_type='notebook',
        on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
        pub_sub_url='assist.collections.refresh',
        is_task=True,
        is_notebook=True,
        last_executed=start_time
    )

    if query:
        q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
        notebook_data = q.get_data()
        snippet = notebook_data['snippets'][0]

        api = get_api(request, snippet)

        destination = '__hue_%s' % notebook_data['uuid'][:4]
        location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
        sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
        input_path = '${nameNode}%s' % location

        task.add_hive_snippet(snippet['database'], sql)

    client = SolrClient(self.user)

    extra_args = ['-Dmapreduce.job.user.classpath.first=true'] if client.is_solr_six_or_more() else []

    task.add_java_snippet(
        clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
        app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
        arguments=extra_args + [
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            client.get_zookeeper_host(),
            u'--collection',
            collection_name,
            input_path,
        ],
        files=[
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ]
    )

    return task.execute(request, batch=True)
def test_download(self):
    statement = "SELECT 'hello world';"

    doc = self.create_query_document(owner=self.user, statement=statement)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0)

    response = self.client.post(reverse('notebook:download'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'format': 'csv'})

    assert_equal(200, response.status_code)
    assert_equal(('Content-Disposition', 'attachment; filename=Test Query.csv'), response._headers['content-disposition'])
def apps(self, filters):
    tasks = Document2.objects.get_history(user=self.user).order_by('-last_modified')[:MAX_JOB_FETCH.get()]

    apps = []

    for app in tasks:
        # Copied, Document class should have a get_history method (via method or inheritance)
        notebook = Notebook(document=app).get_data()
        is_notification_manager = False  # Supposed SQL Editor query only right now

        if 'snippets' in notebook:
            statement = notebook['description'] if is_notification_manager else _get_statement(notebook)
            history = {
                'name': app.name,
                'id': app.id,
                'uuid': app.uuid,
                'type': app.type,
                'data': {
                    'statement': statement[:1001] if statement else '',
                    'lastExecuted': notebook['snippets'][0].get('lastExecuted', -1),
                    'status': notebook['snippets'][0]['status'],
                    'parentSavedQueryUuid': notebook.get('parentSavedQueryUuid', '')
                } if notebook['snippets'] else {},
                'absoluteUrl': app.get_absolute_url(),
            }

            api_status = self._api_status(history)

            if filters.get('states') and api_status.lower() not in filters['states']:
                continue

            apps.append({
                'id': 'history-%010d' % history['id'],
                'name': history['data']['statement'],
                'status': history['data']['status'],
                'apiStatus': api_status,
                'type': 'history-%s' % history['type'],
                'user': self.user.username,
                'progress': 50,
                'queue': '',
                'canWrite': True,
                'duration': 1,
                'submitted': history['data']['lastExecuted']
            })

    return {'apps': apps, 'total': len(tasks)}
def upload_query(request):
    response = {'status': -1}

    source_platform = request.POST.get('sourcePlatform', 'default')

    if OPTIMIZER.AUTO_UPLOAD_QUERIES.get() and source_platform in ('hive', 'impala'):
        query_id = request.POST.get('query_id')

        doc = Document2.objects.document(request.user, doc_id=query_id)

        query_data = Notebook(document=doc).get_data()
        queries = _convert_queries([query_data])
        source_platform = query_data['snippets'][0]['type']

        api = OptimizerApi(request.user)

        response['query_upload'] = api.upload(data=queries, data_type='queries', source_platform=source_platform)
    else:
        response['query_upload'] = _('Skipped')

    response['status'] = 0

    return JsonResponse(response)
def upload_query(request):
    response = {'status': -1}

    interface = request.POST.get('interface', OPTIMIZER.INTERFACE.get())
    source_platform = request.POST.get('sourcePlatform', 'default')
    query_id = request.POST.get('query_id')

    if OPTIMIZER.AUTO_UPLOAD_QUERIES.get() and source_platform in ('hive', 'impala') and query_id:
        try:
            doc = Document2.objects.document(request.user, doc_id=query_id)

            query_data = Notebook(document=doc).get_data()
            queries = _convert_queries([query_data])
            source_platform = query_data['snippets'][0]['type']

            api = get_api(request, interface)

            response['query_upload'] = api.upload(data=queries, data_type='queries', source_platform=source_platform)
        except Document2.DoesNotExist:
            response['query_upload'] = _('Skipped as task query')
    else:
        response['query_upload'] = _('Skipped')

    response['status'] = 0

    return JsonResponse(response)
def test_explain(self):
    # Hive 2 with Tez sets hive.explain.user to true by default, but this test expects
    # the output produced when the setting is false.
    doc = self.create_query_document(owner=self.user, statement=self.statement)
    notebook = Notebook(document=doc)
    snippet = self.get_snippet(notebook, snippet_idx=0)
    snippet['properties']['settings'].append({"key": "hive.explain.user", "value": "false"})

    response = self.client.post(reverse('notebook:explain'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)

    assert_equal(0, data['status'], data)
    assert_true('STAGE DEPENDENCIES' in data['explanation'], data)
    assert_equal(self.statement, data['statement'], data)
def upload_history(request):
    response = {'status': -1}

    n = request.POST.get('n')
    source_platform = request.POST.get('sourcePlatform', 'hive')

    history = Document2.objects.get_history(doc_type='query-%s' % source_platform, user=request.user)
    if n:
        history = history[:int(n)]  # POST values arrive as strings

    queries = []
    for doc in history:
        query_data = Notebook(document=doc).get_data()
        try:
            original_query_id = '%s:%s' % struct.unpack(b"QQ", base64.decodestring(query_data['snippets'][0]['result']['handle']['guid']))
            execution_time = query_data['snippets'][0]['result']['executionTime'] * 100
            queries.append((original_query_id, execution_time, query_data['snippets'][0]['statement']))
        except Exception as e:
            LOG.warning('Skipping upload of %s: %s' % (doc, e))

    # Upload the collected history and report the result, following the same
    # pattern as the other upload_history variants in this module
    api = OptimizerApi()

    response['upload_history'] = api.upload(queries=queries, source_platform=source_platform)
    response['status'] = 0

    return JsonResponse(response)
def upload_history(request):
    response = {'status': -1}

    if request.user.is_superuser:
        api = OptimizerApi(request.user)
        histories = []
        upload_stats = {}

        if request.POST.get('sourcePlatform'):
            # Cap the caller-supplied history size at the configured upload limit
            n = min(int(request.POST.get('n', OPTIMIZER.QUERY_HISTORY_UPLOAD_LIMIT.get())), OPTIMIZER.QUERY_HISTORY_UPLOAD_LIMIT.get())
            source_platform = request.POST.get('sourcePlatform', 'hive')
            histories = [(source_platform, Document2.objects.get_history(doc_type='query-%s' % source_platform, user=request.user)[:n])]
        elif OPTIMIZER.QUERY_HISTORY_UPLOAD_LIMIT.get() > 0:
            histories = [
                (source_platform,
                 Document2.objects.filter(type='query-%s' % source_platform, is_history=True, is_managed=False, is_trashed=False).order_by('-last_modified')[:OPTIMIZER.QUERY_HISTORY_UPLOAD_LIMIT.get()])
                for source_platform in ['hive', 'impala']
            ]

        for source_platform, history in histories:
            queries = _convert_queries([Notebook(document=doc).get_data() for doc in history])
            upload_stats[source_platform] = api.upload(data=queries, data_type='queries', source_platform=source_platform)

        response['upload_history'] = upload_stats
        response['status'] = 0
    else:
        response['message'] = _('Query history upload requires Admin privileges or feature is disabled.')

    return JsonResponse(response)
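# The intent of the clamp on 'n' above: a caller-supplied history size is
# capped at the configured upload limit. A standalone illustration (the limit
# value here is assumed for the example; POST values arrive as strings):
EXAMPLE_UPLOAD_LIMIT = 1000
assert min(int('25'), EXAMPLE_UPLOAD_LIMIT) == 25
assert min(int('5000'), EXAMPLE_UPLOAD_LIMIT) == EXAMPLE_UPLOAD_LIMIT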
def test_fetch_result_size_spark(self):
    if not is_live_cluster() or not is_hive_on_spark():
        raise SkipTest

    # TODO: Add session cleanup here so we don't have orphan spark sessions

    # Assert that a query with no job will return no rows or size
    statement = "SELECT 'hello world';"

    settings = [{'key': 'hive.execution.engine', 'value': 'spark'}]
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(None, data['result']['rows'])
    assert_equal(None, data['result']['size'])

    # Assert that a query that runs a job will return rows and size
    statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(23, data['result']['rows'])
    assert_true(data['result']['size'] > 0)
def test_query_with_unicode(self):
    statement = "SELECT * FROM sample_07 WHERE code='한';"

    doc = self.create_query_document(owner=self.user, statement=statement)
    notebook = Notebook(document=doc)
    snippet = self.get_snippet(notebook, snippet_idx=0)

    response = self.client.post(reverse('notebook:execute'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)

    snippet['result']['handle'] = data['handle']

    response = self.client.post(reverse('notebook:get_logs'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true("SELECT * FROM sample_07 WHERE code='한'" in smart_str(data['logs']))
def test_fetch_result_size_impala(self):
    if not is_live_cluster():
        raise SkipTest

    # Create session so that session object is saved to DB for server URL lookup
    session = self.api.create_session(lang='impala')

    try:
        # Assert that a query that runs a job will return rows
        statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
        doc = self.create_query_document(owner=self.user, query_type='impala', statement=statement)
        notebook = Notebook(document=doc)
        snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

        self.client.post(reverse('notebook:fetch_result_data'),
                         {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'rows': 100, 'startOver': 'false'})

        response = self.client.post(reverse('notebook:fetch_result_size'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_true('result' in data)
        assert_true('rows' in data['result'])
        assert_true('size' in data['result'])
        assert_equal(23, data['result']['rows'])
        assert_equal(None, data['result']['size'])
    finally:
        self.api.close_session(session)
def test_fetch_result_abbreviated(self):
    if not is_live_cluster():
        raise SkipTest

    # Create session so that session object is saved to DB for server URL lookup
    session = self.api.create_session(lang='impala')

    try:
        # Assert that abbreviated rows returned (e.g. - 1.00K) still returns actual rows
        statement = "SELECT * FROM web_logs;"
        doc = self.create_query_document(owner=self.user, query_type='impala', statement=statement)
        notebook = Notebook(document=doc)
        snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=5.0)

        self.client.post(reverse('notebook:fetch_result_data'),
                         {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'rows': 100, 'startOver': 'false'})

        response = self.client.post(reverse('notebook:fetch_result_size'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_true('result' in data)
        assert_true('rows' in data['result'])
        assert_equal(1000, data['result']['rows'])
    finally:
        self.api.close_session(session)
def test_fetch_result_size_mr(self):
    if not is_live_cluster():  # Mini-cluster does not have JHS
        raise SkipTest

    # Assert that a query with no job will return no rows or size
    statement = "SELECT 'hello world';"

    settings = [{'key': 'hive.execution.engine', 'value': 'mr'}]
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(None, data['result']['rows'])
    assert_equal(None, data['result']['size'])

    # Assert that a query with map & reduce task returns rows
    statement = "SELECT DISTINCT code FROM sample_07;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(823, data['result']['rows'])
    assert_true(data['result']['size'] > 0, data['result'])

    # Assert that a query with multiple jobs returns rows
    statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_equal(23, data['result']['rows'])
    assert_true(data['result']['size'] > 0, data['result'])
def run_sync_query(doc_id, user):
    '''Independently run a query as a user.'''
    # Add INSERT INTO table if persist result
    # Add variable substitution
    # Send notifications: done/on failure
    if type(user) is str:
        lookup = {orm_user_lookup(): user}
        user = User.objects.get(**lookup)
        user = rewrite_user(user)

    query_document = Document2.objects.get_by_uuid(user=user, uuid=doc_id)
    notebook = Notebook(document=query_document).get_data()
    snippet = notebook['snippets'][0]

    editor_type = snippet['type']
    sql = _get_statement(notebook)
    request = MockedDjangoRequest(user=user)
    last_executed = time.mktime(datetime.datetime.now().timetuple()) * 1000

    notebook = make_notebook(
        name='Scheduled query %s at %s' % (query_document.name, last_executed),
        editor_type=editor_type,
        statement=sql,
        status='ready',
        last_executed=last_executed,
        is_task=True
    )

    task = notebook.execute(request, batch=True)

    task['uuid'] = task['history_uuid']
    status = check_status(task)

    while status['status'] in ('waiting', 'running'):
        status = check_status(task)
        time.sleep(3)

    return task
def upload_history(request):
    response = {'status': -1}

    query_type = 'hive'

    queries = [
        (doc.uuid, 1000, Notebook(document=doc).get_data()['snippets'][0]['statement'])
        for doc in Document2.objects.get_history(doc_type='query-%s' % query_type, user=request.user)[:25]
    ]

    api = OptimizerApi()

    response['upload_history'] = api.upload(queries=queries, source_platform=query_type)
    response['status'] = 0

    return JsonResponse(response)
def test_fetch_result_size_spark(self):
    if not is_live_cluster() or not is_hive_on_spark():
        raise SkipTest

    # TODO: Add session cleanup here so we don't have orphan spark sessions

    # Assert that a query with no job will return no rows or size
    statement = "SELECT 'hello world';"

    settings = [{'key': 'hive.execution.engine', 'value': 'spark'}]
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(None, data['result']['rows'])
    assert_equal(None, data['result']['size'])

    # Assert that a query that runs a job will return rows and size
    statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'),
                                {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_true(data['result']['rows'] > 0)
    assert_true(data['result']['size'] > 0)
def test_fetch_result_size_impala(self):
    if not is_live_cluster():
        raise SkipTest

    # Create session so that session object is saved to DB for server URL lookup
    session = self.api.create_session(lang='impala')

    try:
        # Assert that a query that runs a job will return rows
        statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
        doc = self.create_query_document(owner=self.user, query_type='impala', statement=statement)
        notebook = Notebook(document=doc)
        snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

        self.client.post(reverse('notebook:fetch_result_data'),
                         {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'rows': 100, 'startOver': 'false'})

        response = self.client.post(reverse('notebook:fetch_result_size'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_true('result' in data)
        assert_true('rows' in data['result'])
        assert_true('size' in data['result'])
        assert_equal(23, data['result']['rows'])
        assert_equal(None, data['result']['size'])

        # Assert that selecting all from partitioned table works
        statement = "SELECT * FROM web_logs;"
        doc = self.create_query_document(owner=self.user, query_type='impala', statement=statement)
        notebook = Notebook(document=doc)
        snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

        self.client.post(reverse('notebook:fetch_result_data'),
                         {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet), 'rows': 100, 'startOver': 'false'})

        response = self.client.post(reverse('notebook:fetch_result_size'),
                                    {'notebook': notebook.get_json(), 'snippet': json.dumps(snippet)})

        data = json.loads(response.content)
        assert_equal(0, data['status'], data)
        assert_true('result' in data)
        assert_true('rows' in data['result'])
        assert_equal(1000, data['result']['rows'])
    finally:
        self.api.close_session(session)
def upload_history(request):
    response = {'status': -1}

    n = request.POST.get('n')
    source_platform = request.POST.get('sourcePlatform', 'hive')

    history = Document2.objects.get_history(doc_type='query-%s' % source_platform, user=request.user)
    if n:
        history = history[:int(n)]  # POST values arrive as strings

    queries = _convert_queries([Notebook(document=doc).get_data() for doc in history])

    api = OptimizerApi()

    response['upload_history'] = api.upload(data=queries, data_type='queries', source_platform=source_platform)
    response['status'] = 0

    return JsonResponse(response)
class Submission(object):
    """
    Represents one unique Oozie submission.

    Actions are:
    - submit
    - rerun
    """

    def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None):
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        if properties is not None:
            self.properties = properties
        else:
            self.properties = {}

        if local_tz and isinstance(self.job.data, dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            if 'start_date' in self.properties:
                properties['start_date'] = convert_to_server_timezone(self.properties['start_date'], local_tz)
            if 'end_date' in self.properties:
                properties['end_date'] = convert_to_server_timezone(self.properties['end_date'], local_tz)

        if 'nominal_time' in self.properties:
            properties['nominal_time'] = convert_to_server_timezone(self.properties['nominal_time'], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        if self.oozie_id:
            res = "Submission for job '%s'." % (self.oozie_id,)
        else:
            res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user)
        if self.oozie_id:
            res += " -- " + self.oozie_id
        return res

    @submit_dryrun
    def run(self, deployment_dir=None):
        """
        Take care of all the actions of submitting an Oozie workflow.
        Returns the Oozie job id if all goes well.
        """
        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self,))

        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self,))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.wf.application.path': deployment_dir})

        if 'oozie.coord.application.path' in self.properties:
            self.properties.pop('oozie.coord.application.path')

        if 'oozie.bundle.application.path' in self.properties:
            self.properties.pop('oozie.bundle.application.path')

        if fail_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes})
        elif not skip_nodes:
            self.properties.update({'oozie.wf.rerun.failnodes': 'false'})  # Case empty 'skip_nodes' list
        else:
            self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes})

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.coord.application.path': deployment_dir})

        self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def update_coord(self):
        self.api = get_oozie(self.user, api_version="v2")

        self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None)
        LOG.info("Update: %s" % (self,))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties.update({'oozie.bundle.application.path': deployment_dir})
        self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception as ex:
            msg = _("Failed to create deployment directory: %s" % ex)
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(jt_address)  # Needed for coordinator deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action.data['type'] == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
                    self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

                elif action.data['type'] == 'altus':
                    self._create_file(deployment_dir, action.data['name'] + '.sh', '''#!/usr/bin/env bash

export PYTHONPATH=`pwd`

echo 'Starting Altus command...'

python altus.py
''')
                    shell_script = self._generate_altus_action_script(
                        service=action.data['properties'].get('service'),
                        command=action.data['properties'].get('command'),
                        arguments=dict([arg.split('=', 1) for arg in action.data['properties'].get('arguments', [])]),
                        auth_key_id=ALTUS.AUTH_KEY_ID.get(),
                        auth_key_secret=ALTUS.AUTH_KEY_SECRET.get().replace('\\n', '\n')
                    )
                    self._create_file(deployment_dir, 'altus.py', shell_script)

                    ext_py_lib_path = os.path.join(get_desktop_root(), 'core', 'ext-py')
                    lib_dir_path = os.path.join(self.job.deployment_dir, 'lib')
                    libs = [
                        (os.path.join(ext_py_lib_path, 'navoptapi-0.1.0'), 'navoptapi'),
                        (os.path.join(ext_py_lib_path, 'navoptapi-0.1.0'), 'altuscli'),
                        (os.path.join(ext_py_lib_path, 'asn1crypto-0.24.0'), 'asn1crypto'),
                        (os.path.join(ext_py_lib_path, 'rsa-3.4.2'), 'rsa'),
                        (os.path.join(ext_py_lib_path, 'pyasn1-0.1.8'), 'pyasn1'),
                    ]
                    for source_path, name in libs:
                        destination_path = os.path.join(lib_dir_path, name)
                        if not self.fs.do_as_user(self.user, self.fs.exists, destination_path):
                            # Note: would be much faster to have only one zip archive
                            self.fs.do_as_user(self.user, self.fs.copyFromLocal, os.path.join(source_path, name), destination_path)

                elif action.data['type'] == 'impala' or action.data['type'] == 'impala-document':
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action.data['type'] == 'impala-document':
                        from notebook.models import Notebook
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(**self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name, statements)
                    else:
                        script_name = os.path.basename(action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal': self.properties.get('user_principal', action.data['properties'].get('user_principal'))
                        }
                    else:
                        kinit = ''

                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
                        'kerberos_option': '-k' if self.api.security_enabled else '',
                        'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
                        'query_file': script_name,
                        'kinit': kinit
                    }

                    self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

                elif action.data['type'] == 'hive-document':
                    from notebook.models import Notebook
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get('statements')

                    if self.properties.get('send_result_path'):
                        statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = "\t",
  "quoteChar" = "'",
  "escapeChar" = "\\"
)
STORED AS TEXTFILE
%s""" % (self.properties.get('send_result_path'), '\n\n\n'.join([snippet['statement_raw'] for snippet in notebook.get_data()['snippets']]))

                    if statements is not None:
                        self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

                elif action.data['type'] in ('java-document', 'java', 'mapreduce-document'):
                    if action.data['type'] == 'java-document' or action.data['type'] == 'mapreduce-document':
                        from notebook.models import Notebook
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        properties = notebook.get_data()['snippets'][0]['properties']
                    else:
                        properties = action.data['properties']

                    if properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" % properties['app_jar'])
                        paths = [properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action.data['type'] == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data()['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

                elif action.data['type'] in ('spark', 'spark-document') or (
                        action.data['type'] in ('sqoop', 'sqoop-document') and '--hive-import' in action.data['properties']['statement']):
                    if not [f for f in action.data.get('properties').get('files', []) if f.get('value').endswith('hive-site.xml')]:
                        hive_site_lib = Hdfs.join(deployment_dir + '/lib/', 'hive-site.xml')
                        hive_site_content = get_hive_site_content()
                        if not self.fs.do_as_user(self.user, self.fs.exists, hive_site_lib) and hive_site_content:
                            self.fs.do_as_user(self.user, self.fs.create, hive_site_lib, overwrite=True, permission=0o700, data=smart_str(hive_site_content))

                    if action.data['type'] in ('sqoop', 'sqoop-document'):
                        if CONFIG_JDBC_LIBS_PATH.get() and CONFIG_JDBC_LIBS_PATH.get() not in self.properties.get('oozie.libpath', ''):
                            LOG.debug("Adding to oozie.libpath %s" % CONFIG_JDBC_LIBS_PATH.get())
                            paths = [CONFIG_JDBC_LIBS_PATH.get()]
                            if self.properties.get('oozie.libpath'):
                                paths.append(self.properties['oozie.libpath'])
                            self.properties['oozie.libpath'] = ','.join(paths)

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

        return deployment_dir
class Submission(object): """ Represents one unique Oozie submission. Actions are: - submit - rerun """ def __init__(self, user, job=None, fs=None, jt=None, properties=None, oozie_id=None, local_tz=None): self.job = job self.user = user self.fs = fs self.jt = jt # Deprecated with YARN, we now use logical names only for RM self.oozie_id = oozie_id self.api = get_oozie(self.user) if properties is not None: self.properties = properties else: self.properties = {} if local_tz and isinstance(self.job.data, dict): local_tz = self.job.data.get('properties')['timezone'] # Modify start_date & end_date only when it's a coordinator from oozie.models2 import Coordinator if type(self.job) is Coordinator: if 'start_date' in self.properties: properties['start_date'] = convert_to_server_timezone(self.properties['start_date'], local_tz) if 'end_date' in self.properties: properties['end_date'] = convert_to_server_timezone(self.properties['end_date'], local_tz) if 'nominal_time' in self.properties: properties['nominal_time'] = convert_to_server_timezone(self.properties['nominal_time'], local_tz) self.properties['security_enabled'] = self.api.security_enabled def __str__(self): if self.oozie_id: res = "Submission for job '%s'." % (self.oozie_id,) else: res = "Submission for job '%s' (id %s, owner %s)." % (self.job.name, self.job.id, self.user) if self.oozie_id: res += " -- " + self.oozie_id return res @submit_dryrun def run(self, deployment_dir=None): """ Take care of all the actions of submitting a Oozie workflow. Returns the oozie job id if all goes well. """ if self.properties and 'oozie.use.system.libpath' not in self.properties: self.properties['oozie.use.system.libpath'] = 'true' self.oozie_id = self.api.submit_job(self.properties) LOG.info("Submitted: %s" % (self,)) if self._is_workflow(): self.api.job_control(self.oozie_id, 'start') LOG.info("Started: %s" % (self,)) return self.oozie_id def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None): jt_address = cluster.get_cluster_addr_for_job_submission() self._update_properties(jt_address, deployment_dir) self.properties.update({'oozie.wf.application.path': deployment_dir}) if 'oozie.coord.application.path' in self.properties: self.properties.pop('oozie.coord.application.path') if 'oozie.bundle.application.path' in self.properties: self.properties.pop('oozie.bundle.application.path') if fail_nodes: self.properties.update({'oozie.wf.rerun.failnodes': fail_nodes}) elif not skip_nodes: self.properties.update({'oozie.wf.rerun.failnodes': 'false'}) # Case empty 'skip_nodes' list else: self.properties.update({'oozie.wf.rerun.skip.nodes': skip_nodes}) self.api.rerun(self.oozie_id, properties=self.properties) LOG.info("Rerun: %s" % (self,)) return self.oozie_id def rerun_coord(self, deployment_dir, params): jt_address = cluster.get_cluster_addr_for_job_submission() self._update_properties(jt_address, deployment_dir) self.properties.update({'oozie.coord.application.path': deployment_dir}) self.api.job_control(self.oozie_id, action='coord-rerun', properties=self.properties, parameters=params) LOG.info("Rerun: %s" % (self,)) return self.oozie_id def update_coord(self): self.api = get_oozie(self.user, api_version="v2") self.api.job_control(self.oozie_id, action='update', properties=self.properties, parameters=None) LOG.info("Update: %s" % (self,)) return self.oozie_id def rerun_bundle(self, deployment_dir, params): jt_address = cluster.get_cluster_addr_for_job_submission() self._update_properties(jt_address, deployment_dir) 
        self.properties.update({'oozie.bundle.application.path': deployment_dir})
        self.api.job_control(self.oozie_id, action='bundle-rerun', properties=self.properties, parameters=params)
        LOG.info("Rerun: %s" % (self,))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception, ex:
            msg = _("Failed to create deployment directory: %s" % ex)
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(jt_address)  # Needed for coordinators deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                # Make sure the XML is there.
                # Sub-workflows nested more than one level deep are not supported.
                if action.data['type'] == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs, self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(action, workflow.id)  # For displaying the correct graph
                    self.properties['workspace_%s' % workflow.uuid] = workspace  # For pointing to the correct workspace

                elif action.data['type'] in ('impala', 'impala-document'):
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action.data['type'] == 'impala-document':
                        from notebook.models import Notebook
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(**self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name, statements)
                    else:
                        script_name = os.path.basename(action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal': self.properties.get('user_principal', action.data['properties'].get('user_principal'))
                        }
                    else:
                        kinit = ''

                    # Illustrative rendering (hypothetical values): on a kerberized, SSL-enabled
                    # cluster this produces roughly:
                    #   kinit -k -t *.keytab hue/host@REALM
                    #   impala-shell -k --ssl -i <impalad_host> -f <action_name>.sql
                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host': action.data['properties'].get('impalad_host') or _get_impala_url(),
                        'kerberos_option': '-k' if self.api.security_enabled else '',
                        'ssl_option': '--ssl' if get_ssl_server_certificate() else '',
                        'query_file': script_name,
                        'kinit': kinit
                    }

                    self._create_file(deployment_dir, action.data['name'] + '.sh', shell_script)

                elif action.data['type'] == 'hive-document':
                    from notebook.models import Notebook
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get('statements')

                    if self.properties.get('send_result_path'):
                        # Note: this branch assumes the action references a saved document,
                        # so `notebook` is defined above.
                        statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = "\t",
  "quoteChar" = "'",
  "escapeChar" = "\\"
)
STORED AS TEXTFILE
%s""" % (self.properties.get('send_result_path'), '\n\n\n'.join([snippet['statement_raw'] for snippet in notebook.get_data()['snippets']]))

                    if statements is not None:
                        self._create_file(deployment_dir, action.data['name'] + '.sql', statements)

                elif action.data['type'] in ('java-document', 'java', 'mapreduce-document'):
                    if action.data['type'] in ('java-document', 'mapreduce-document'):
                        from notebook.models import Notebook
                        notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                        properties = notebook.get_data()['snippets'][0]['properties']
                    else:
                        properties = action.data['properties']

                    if properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" % properties['app_jar'])
                        paths = [properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action.data['type'] == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=action.data['properties']['uuid']))
                    statements = notebook.get_data()['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir, action.data['name'] + '.pig', statements)

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir, oozie_xml, self.properties)

        return deployment_dir
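
# --- Hedged usage sketch (illustrative, not from the original source): how the
# Submission class above is typically driven end to end. `workflow`, `fs` and
# `jt` stand in for a models2 job object and the Hadoop clients; the helper
# name is hypothetical.
def submit_workflow_sketch(user, workflow, fs, jt):
    submission = Submission(user, job=workflow, fs=fs, jt=jt, properties={})
    deployment_dir = submission.deploy()   # Materialize workflow.xml and action scripts on HDFS
    return submission.run(deployment_dir)  # Submit the job, then 'start' it if it is a workflow
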
def test_fetch_result_size_mr(self):
    if not is_live_cluster():  # Mini-cluster does not have a JHS
        raise SkipTest

    # Assert that a query with no job will return no rows or size
    statement = "SELECT 'hello world';"

    settings = [{'key': 'hive.execution.engine', 'value': 'mr'}]
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0)

    response = self.client.post(reverse('notebook:fetch_result_size'), {
        'notebook': notebook.get_json(),
        'snippet': json.dumps(snippet)
    })

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true('size' in data['result'])
    assert_equal(None, data['result']['rows'])
    assert_equal(None, data['result']['size'])

    # Assert that a query with map & reduce tasks returns rows
    statement = "SELECT DISTINCT code FROM sample_07;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'), {
        'notebook': notebook.get_json(),
        'snippet': json.dumps(snippet)
    })

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true(data['result']['rows'] > 0)

    # Assert that a query with multiple jobs returns rows
    statement = "SELECT app, COUNT(1) AS count FROM web_logs GROUP BY app ORDER BY count DESC;"
    doc = self.create_query_document(owner=self.user, statement=statement, settings=settings)
    notebook = Notebook(document=doc)
    snippet = self.execute_and_wait(doc, snippet_idx=0, timeout=60.0, wait=2.0)

    response = self.client.post(reverse('notebook:fetch_result_size'), {
        'notebook': notebook.get_json(),
        'snippet': json.dumps(snippet)
    })

    data = json.loads(response.content)
    assert_equal(0, data['status'], data)
    assert_true('result' in data)
    assert_true('rows' in data['result'])
    assert_true(data['result']['rows'] > 0)
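
# --- Hedged illustration (values hypothetical): the fetch_result_size payload
# shape implied by the assertions above. A query that launched no MR job reports
# null rows/size; a query that ran one or more jobs reports positive counts.
#
#   {"status": 0, "result": {"rows": null, "size": null}}    # "SELECT 'hello world';"
#   {"status": 0, "result": {"rows": 823, "size": 1024000}}  # after MR job(s)
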