def test_list(self):
  testfile = 'abfs://'
  filesystems = self.client.listdir(testfile)
  LOG.debug("%s" % filesystems)
  assert_true(filesystems is not None, filesystems)

  pathing = self.client.listdir(testfile + filesystems[0], {"recursive": "true"})
  LOG.debug("%s" % pathing)
  assert_true(pathing is not None, pathing)

  directory = self.client.listdir(testfile + filesystems[0] + '/' + pathing[0])
  LOG.debug("%s" % directory)
  assert_true(directory is not None, directory)

  directory = self.client.listdir(self.test_fs)
  LOG.debug("%s" % directory)
  assert_true(directory is not None, directory)

  directory = self.client.listdir(abfspath(self.test_fs))
  LOG.debug("%s" % directory)
  assert_true(directory is not None, directory)

  pathing = self.client._statsf(filesystems[276])
  LOG.debug("%s" % pathing)
  assert_true(pathing is not None, pathing)

  pathing = self.client._statsf(filesystems[277])
  LOG.debug("%s" % pathing)
  assert_true(pathing is not None, pathing)
def submit_external_job(request, application_path):
  ParametersFormSet = formset_factory(ParameterForm, extra=0)

  if application_path.startswith('abfs:/') and not application_path.startswith('abfs://'):
    application_path = application_path.replace("abfs:/", "abfs://")
  elif application_path.startswith('s3a:/') and not application_path.startswith('s3a://'):
    application_path = application_path.replace('s3a:/', 's3a://')
  else:
    application_path = "/" + application_path

  if application_path.startswith("abfs://"):
    application_path = abfspath(application_path)

  if request.method == 'POST':
    params_form = ParametersFormSet(request.POST)

    if params_form.is_valid():
      mapping = dict([(param['name'], param['value']) for param in params_form.cleaned_data])
      mapping['dryrun'] = request.POST.get('dryrun_checkbox') == 'on'
      application_name = os.path.basename(application_path)
      application_class = Bundle if application_name == 'bundle.xml' else Coordinator if application_name == 'coordinator.xml' else get_workflow()
      mapping[application_class.get_application_path_key()] = os.path.dirname(application_path)

      try:
        submission = Submission(request.user, fs=request.fs, jt=request.jt, properties=mapping)
        job_id = submission.run(application_path)
      except RestException as ex:
        detail = ex._headers.get('oozie-error-message', ex)
        if 'Max retries exceeded with url' in str(detail):
          detail = '%s: %s' % (_('The Oozie server is not running'), detail)

        LOG.exception(smart_str(detail))
        raise PopupException(_("Error submitting job %s") % (application_path,), detail=detail)

      jsonify = request.POST.get('format') == 'json'
      if jsonify:
        return JsonResponse({'status': 0, 'job_id': job_id, 'type': 'external_workflow'}, safe=False)
      else:
        request.info(_('Oozie job submitted'))
        view = 'list_oozie_bundle' if application_name == 'bundle.xml' else 'list_oozie_coordinator' if application_name == 'coordinator.xml' else 'list_oozie_workflow'
        return redirect(reverse('oozie:%s' % view, kwargs={'job_id': job_id}))
    else:
      request.error(_('Invalid submission form: %s' % params_form.errors))
  else:
    parameters = Submission(request.user, fs=request.fs, jt=request.jt).get_external_parameters(application_path)
    initial_params = ParameterForm.get_initial_params(parameters)
    params_form = ParametersFormSet(initial=initial_params)

  popup = render('editor/submit_job_popup.mako', request, {
      'params_form': params_form,
      'name': _('Job'),
      'action': reverse('oozie:submit_external_job', kwargs={'application_path': application_path}),
      'show_dryrun': os.path.basename(application_path) != 'bundle.xml',
      'return_json': request.GET.get('format') == 'json'
  }, force_template=True).content
  return JsonResponse(popup, safe=False)
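# The scheme normalization at the top of submit_external_job can be exercised on its own.
# Below is a minimal sketch of the same logic, assuming the path arrives from a Django URL
# capture that collapses '//' and strips the leading '/'; the helper name is illustrative
# and not part of the Hue API.
def _normalize_application_path(application_path):
  if application_path.startswith('abfs:/') and not application_path.startswith('abfs://'):
    return application_path.replace('abfs:/', 'abfs://')
  elif application_path.startswith('s3a:/') and not application_path.startswith('s3a://'):
    return application_path.replace('s3a:/', 's3a://')
  # Anything else is treated as an HDFS path and gets its leading slash restored.
  return '/' + application_path

assert _normalize_application_path('abfs:/data/app') == 'abfs://data/app'
assert _normalize_application_path('s3a:/bucket/app') == 's3a://bucket/app'
assert _normalize_application_path('user/hue/oozie/app') == '/user/hue/oozie/app'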
def create_table_from_a_file(self, source, destination, start_time=-1, file_encoding=None):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']

  final_table_name = table_name

  table_format = destination['tableFormat']
  source_type = source['sourceType']

  columns = destination['columns']
  partition_columns = destination['partitionColumns']
  kudu_partition_columns = destination['kuduPartitionColumns']
  comment = destination['description']

  source_path = urllib_unquote(source['path'])
  load_data = destination['importData']
  external = not destination['useDefaultLocation']
  external_path = urllib_unquote(destination['nonDefaultLocation'])

  editor_type = destination['sourceType']
  is_transactional = destination['isTransactional']
  default_transactional_type = 'insert_only' if destination['isInsertOnly'] else 'default'

  skip_header = destination['hasHeader']
  primary_keys = destination['primaryKeys']

  if destination['useCustomDelimiters']:
    field_delimiter = destination['customFieldDelimiter']
    collection_delimiter = destination['customCollectionDelimiter'] or None
    map_delimiter = destination['customMapDelimiter'] or None
  else:
    field_delimiter = ','
    collection_delimiter = r'\002'
    map_delimiter = r'\003'

  regexp_delimiter = destination['customRegexp']

  file_format = 'TextFile'
  row_format = 'Delimited'
  serde_name = ''
  serde_properties = ''
  extra_create_properties = ''
  sql = ''

  if source['inputFormat'] == 'manual':
    load_data = False
    source['format'] = {'quoteChar': '"', 'fieldSeparator': ','}

  if table_format == 'json':
    row_format = 'serde'
    serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe'
  elif table_format == 'regexp':
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe'
    serde_properties = '"input.regex" = "%s"' % regexp_delimiter
  elif table_format == 'csv':
    if source['format']['quoteChar'] == '"':
      source['format']['quoteChar'] = '\\"'
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
    serde_properties = '''"separatorChar" = "%(fieldSeparator)s",
      "quoteChar" = "%(quoteChar)s",
      "escapeChar" = "\\\\"
    ''' % source['format']

  use_temp_table = table_format in ('parquet', 'orc', 'kudu') or is_transactional
  if use_temp_table:  # We'll be using a temp table to load data
    if load_data:
      table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

      sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
          'database': database,
          'table_name': table_name
      }
    else:  # Manual
      row_format = ''
      file_format = table_format
      skip_header = False
      if table_format == 'kudu':
        columns = [col for col in columns if col['name'] in primary_keys] + \
            [col for col in columns if col['name'] not in primary_keys]

  if table_format == 'kudu':
    collection_delimiter = None
    map_delimiter = None

  if external or (load_data and table_format in ('parquet', 'orc', 'kudu')):  # We'll use location to load data
    if not self.fs.isdir(external_path):  # File selected
      external_path, external_file_name = Hdfs.split(external_path)

      if len(self.fs.listdir(external_path)) > 1:
        # If dir not just the file, create data dir and move file there. Make sure it's unique.
        external_path = external_path + '/%s%s_table' % (external_file_name, str(uuid.uuid4()))
        self.fs.mkdir(external_path)
        self.fs.rename(source_path, external_path)
  elif load_data:  # We'll use load data command
    parent_path = self.fs.parent_path(source_path)
    stats = self.fs.stats(parent_path)
    split = urlparse(source_path)
    # Only for HDFS, import data and non-external table
    if split.scheme in ('', 'hdfs') and oct(stats["mode"])[-1] != '7':
      user_scratch_dir = self.fs.get_home_dir() + '/.scratchdir/%s' % str(uuid.uuid4())  # Make sure it's unique.
      self.fs.do_as_user(self.user, self.fs.mkdir, user_scratch_dir, 0o0777)
      self.fs.do_as_user(self.user, self.fs.rename, source['path'], user_scratch_dir)
      source_path = user_scratch_dir + '/' + source['path'].split('/')[-1]

  if external_path.lower().startswith("abfs"):  # Check whether the path is on ABFS
    external_path = abfspath(external_path)

  tbl_properties = OrderedDict()
  if skip_header:
    tbl_properties['skip.header.line.count'] = '1'
  # The temp table is not transactional, but final table can be if is_transactional.
  # tbl_properties that don't exist in previous versions can safely be added without error.
  tbl_properties['transactional'] = 'false'

  sql += django_mako.render_to_string("gen/create_table_statement.mako", {
      'table': {
          'name': table_name,
          'comment': comment,
          'row_format': row_format,
          'field_terminator': field_delimiter,
          'collection_terminator': collection_delimiter if source_type == 'hive' else None,
          'map_key_terminator': map_delimiter if source_type == 'hive' else None,
          'serde_name': serde_name,
          'serde_properties': serde_properties,
          'file_format': file_format,
          'external': external or load_data and table_format in ('parquet', 'orc', 'kudu'),
          'path': external_path,
          'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
          'tbl_properties': tbl_properties
      },
      'columns': columns,
      'partition_columns': partition_columns,
      'kudu_partition_columns': kudu_partition_columns,
      'database': database
  })

  if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8' and not use_temp_table:
    sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
        'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
            'database': database,
            'final_table_name': final_table_name,
            'file_encoding': file_encoding
        }

  if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data:
    form_data = {
        'path': source_path,
        'overwrite': False,
        'partition_columns': [(partition['name'], partition['partitionValue']) for partition in partition_columns],
    }
    query_server_config = dbms.get_query_server_config(name=source_type)
    db = dbms.get(self.user, query_server=query_server_config)
    sql += "\n\n%s;" % db.load_data(database, table_name, form_data, None, generate_ddl_only=True)

  if load_data and use_temp_table:
    file_format = 'TextFile' if table_format == 'text' else table_format
    if table_format == 'kudu':
      columns_list = ['`%s`' % col for col in primary_keys +
          [col['name'] for col in destination['columns'] if col['name'] not in primary_keys and col['keep']]]
      extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
      PARTITION BY HASH PARTITIONS 16
      STORED AS %(file_format)s
      TBLPROPERTIES(
        'kudu.num_tablet_replicas' = '1'
      )""" % {
          'file_format': file_format,
          'primary_keys': ', '.join(primary_keys)
      }
    else:
      columns_list = ['*']
      extra_create_properties = 'STORED AS %(file_format)s' % {'file_format': file_format}

    if is_transactional:
      extra_create_properties += '\nTBLPROPERTIES("transactional"="true", "transactional_properties"="%s")' % \
          default_transactional_type

    sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s
      %(extra_create_properties)s
      AS SELECT %(columns_list)s
      FROM `%(database)s`.`%(table_name)s`;''' % {
        'database': database,
        'final_table_name': final_table_name,
        'table_name': table_name,
        'extra_create_properties': extra_create_properties,
        'columns_list': ', '.join(columns_list),
        'comment': ' COMMENT "%s"' % comment if comment else ''
    }

    sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
        'database': database,
        'table_name': table_name
    }

    if file_encoding and file_encoding != 'ASCII' and file_encoding != 'utf-8':
      sql += '\n\nALTER TABLE `%(database)s`.`%(final_table_name)s` ' \
          'SET serdeproperties ("serialization.encoding"="%(file_encoding)s");' % {
              'database': database,
              'final_table_name': final_table_name,
              'file_encoding': file_encoding
          }

  on_success_url = reverse('metastore:describe_table', kwargs={
      'database': database,
      'table': final_table_name
  }) + '?source_type=' + source_type

  return make_notebook(
      name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
      editor_type=editor_type,
      statement=sql.strip(),
      status='ready',
      database=database,
      on_success_url=on_success_url,
      last_executed=start_time,
      is_task=True
  )
def export_result(request):
  response = {'status': -1, 'message': _('Success')}

  # Passed by check_document_access_permission but unused by APIs
  notebook = json.loads(request.POST.get('notebook', '{}'))
  snippet = json.loads(request.POST.get('snippet', '{}'))
  data_format = json.loads(request.POST.get('format', '"hdfs-file"'))
  destination = urllib_unquote(json.loads(request.POST.get('destination', '""')))
  overwrite = json.loads(request.POST.get('overwrite', 'false'))
  is_embedded = json.loads(request.POST.get('is_embedded', 'false'))
  start_time = json.loads(request.POST.get('start_time', '-1'))

  api = get_api(request, snippet)

  if data_format == 'hdfs-file':  # Blocking operation, like downloading
    if request.fs.isdir(destination):
      if notebook.get('name'):
        destination += '/%(name)s.csv' % notebook
      else:
        destination += '/%(type)s-%(id)s.csv' % notebook
    if overwrite and request.fs.exists(destination):
      request.fs.do_as_user(request.user.username, request.fs.rmtree, destination)
    response['watch_url'] = api.export_data_as_hdfs_file(snippet, destination, overwrite)
    response['status'] = 0
    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %s exported to HDFS destination: %s' % (request.user.username, destination),
        'allowed': True
    }
  elif data_format == 'hive-table':
    if is_embedded:
      sql, success_url = api.export_data_as_table(notebook, snippet, destination)

      task = make_notebook(
          name=_('Export %s query to table %s') % (snippet['type'], destination),
          description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
          editor_type=snippet['type'],
          statement=sql,
          status='ready',
          database=snippet['database'],
          on_success_url=success_url,
          last_executed=start_time,
          is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=save_as_table&notebook=' + str(notebook_id) + \
          '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %s exported to Hive table: %s' % (request.user.username, destination),
        'allowed': True
    }
  elif data_format == 'hdfs-directory':
    if destination.lower().startswith("abfs"):
      destination = abfspath(destination)
    if request.fs.exists(destination) and request.fs.listdir_stats(destination):
      raise PopupException(_('The destination is not an empty directory!'))
    if is_embedded:
      sql, success_url = api.export_large_data_to_hdfs(notebook, snippet, destination)

      task = make_notebook(
          name=_('Export %s query to directory') % snippet['type'],
          description=_('Query %s to %s') % (_get_snippet_name(notebook), success_url),
          editor_type=snippet['type'],
          statement=sql,
          status='ready-execute',
          database=snippet['database'],
          on_success_url=success_url,
          last_executed=start_time,
          is_task=True
      )
      response = task.execute(request)
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=insert_as_query&notebook=' + str(notebook_id) + \
          '&snippet=0&destination=' + destination
      response['status'] = 0
    request.audit = {
        'operation': 'EXPORT',
        'operationText': 'User %s exported to HDFS directory: %s' % (request.user.username, destination),
        'allowed': True
    }
  elif data_format in ('search-index', 'dashboard'):
    # Open the result in the Dashboard via a SQL sub-query or the Import wizard (quick vs scalable)
    if is_embedded:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))

      if data_format == 'dashboard':
        engine = notebook['type'].replace('query-', '')
        response['watch_url'] = reverse('dashboard:browse', kwargs={'name': notebook_id}) + \
            '?source=query&engine=%(engine)s' % {'engine': engine}
        response['status'] = 0
      else:
        sample = get_api(request, snippet).fetch_result(notebook, snippet, rows=4, start_over=True)
        for col in sample['meta']:
          col['type'] = HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')

        response['status'] = 0
        response['id'] = notebook_id
        response['name'] = _get_snippet_name(notebook)
        response['source_type'] = 'query'
        response['target_type'] = 'index'
        response['target_path'] = destination
        response['sample'] = list(sample['data'])
        response['columns'] = [Field(col['name'], col['type']).to_dict() for col in sample['meta']]
    else:
      notebook_id = notebook['id'] or request.GET.get('editor', request.GET.get('notebook'))
      response['watch_url'] = reverse('notebook:execute_and_watch') + '?action=index_query&notebook=' + str(notebook_id) + \
          '&snippet=0&destination=' + destination
      response['status'] = 0

  if response.get('status') != 0:
    response['message'] = _('Exporting result failed.')

  return JsonResponse(response)
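# For reference, a hedged sketch of the POST parameters export_result expects; every value is
# JSON-encoded, as implied by the json.loads calls above. The concrete values (and the ABFS
# destination) are illustrative assumptions, not the definitive API contract.
example_post_data = {
  'notebook': json.dumps({'id': 123, 'type': 'query-hive', 'name': 'sales report'}),
  'snippet': json.dumps({'type': 'hive', 'database': 'default'}),
  'format': json.dumps('hdfs-directory'),
  'destination': json.dumps('abfs://container/exports/sales'),  # rewritten via abfspath() above
  'overwrite': json.dumps(False),
  'is_embedded': json.dumps(True),
  'start_time': json.dumps(-1),
}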
def create_table_from_a_file(self, source, destination, start_time=-1):
  if '.' in destination['name']:
    database, table_name = destination['name'].split('.', 1)
  else:
    database = 'default'
    table_name = destination['name']

  final_table_name = table_name

  table_format = destination['tableFormat']
  source_type = source['sourceType']

  columns = destination['columns']
  partition_columns = destination['partitionColumns']
  kudu_partition_columns = destination['kuduPartitionColumns']
  comment = destination['description']

  source_path = urllib_unquote(source['path'])
  external = not destination['useDefaultLocation']
  external_path = urllib_unquote(destination['nonDefaultLocation'])

  load_data = destination['importData']
  skip_header = destination['hasHeader']
  primary_keys = destination['primaryKeys']

  if destination['useCustomDelimiters']:
    field_delimiter = destination['customFieldDelimiter']
    collection_delimiter = destination['customCollectionDelimiter']
    map_delimiter = destination['customMapDelimiter']
  else:
    field_delimiter = ','
    collection_delimiter = r'\002'
    map_delimiter = r'\003'

  regexp_delimiter = destination['customRegexp']

  file_format = 'TextFile'
  row_format = 'Delimited'
  serde_name = ''
  serde_properties = ''
  extra_create_properties = ''
  sql = ''

  if source['inputFormat'] == 'manual':
    load_data = False
    source['format'] = {'quoteChar': '"', 'fieldSeparator': ','}

  if table_format == 'json':
    row_format = 'serde'
    serde_name = 'org.apache.hive.hcatalog.data.JsonSerDe'
  elif table_format == 'regexp':
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.RegexSerDe'
    serde_properties = '"input.regex" = "%s"' % regexp_delimiter
  elif table_format == 'csv':
    if source['format']['quoteChar'] == '"':
      source['format']['quoteChar'] = '\\"'
    row_format = 'serde'
    serde_name = 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
    serde_properties = '''"separatorChar" = "%(fieldSeparator)s",
      "quoteChar" = "%(quoteChar)s",
      "escapeChar" = "\\\\"
    ''' % source['format']

  if table_format in ('parquet', 'kudu'):
    if load_data:
      table_name, final_table_name = 'hue__tmp_%s' % table_name, table_name

      sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
          'database': database,
          'table_name': table_name
      }
    else:  # Manual
      row_format = ''
      file_format = table_format
      skip_header = False
      if table_format == 'kudu':
        columns = [col for col in columns if col['name'] in primary_keys] + \
            [col for col in columns if col['name'] not in primary_keys]

  if table_format == 'kudu':
    collection_delimiter = None
    map_delimiter = None

  if external or (load_data and table_format in ('parquet', 'kudu')):
    if not self.fs.isdir(external_path):  # File selected
      external_path, external_file_name = self.fs.split(external_path)

      if len(self.fs.listdir(external_path)) > 1:
        # If dir not just the file, create data dir and move file there.
        external_path = external_path + '/%s_table' % external_file_name
        self.fs.mkdir(external_path)
        self.fs.rename(source_path, external_path)

  if external_path.lower().startswith("abfs"):  # Check whether the path is on ABFS
    external_path = abfspath(external_path)

  sql += django_mako.render_to_string("gen/create_table_statement.mako", {
      'table': {
          'name': table_name,
          'comment': comment,
          'row_format': row_format,
          'field_terminator': field_delimiter,
          'collection_terminator': collection_delimiter if source_type == 'hive' else None,
          'map_key_terminator': map_delimiter if source_type == 'hive' else None,
          'serde_name': serde_name,
          'serde_properties': serde_properties,
          'file_format': file_format,
          'external': external or load_data and table_format in ('parquet', 'kudu'),
          'path': external_path,
          'skip_header': skip_header,
          'primary_keys': primary_keys if table_format == 'kudu' and not load_data else [],
      },
      'columns': columns,
      'partition_columns': partition_columns,
      'kudu_partition_columns': kudu_partition_columns,
      'database': database
  })

  if table_format in ('text', 'json', 'csv', 'regexp') and not external and load_data:
    form_data = {
        'path': source_path,
        'overwrite': False,
        'partition_columns': [(partition['name'], partition['partitionValue']) for partition in partition_columns],
    }
    query_server_config = dbms.get_query_server_config(name=source_type)
    db = dbms.get(self.user, query_server=query_server_config)
    sql += "\n\n%s;" % db.load_data(database, table_name, form_data, None, generate_ddl_only=True)

  if load_data and table_format in ('parquet', 'kudu'):
    file_format = table_format
    if table_format == 'kudu':
      columns_list = ['`%s`' % col for col in primary_keys +
          [col['name'] for col in destination['columns'] if col['name'] not in primary_keys and col['keep']]]
      extra_create_properties = """PRIMARY KEY (%(primary_keys)s)
      PARTITION BY HASH PARTITIONS 16
      STORED AS %(file_format)s
      TBLPROPERTIES(
        'kudu.num_tablet_replicas' = '1'
      )""" % {
          'file_format': file_format,
          'primary_keys': ', '.join(primary_keys)
      }
    else:
      columns_list = ['*']
      extra_create_properties = 'STORED AS %(file_format)s' % {'file_format': file_format}

    sql += '''\n\nCREATE TABLE `%(database)s`.`%(final_table_name)s`%(comment)s
      %(extra_create_properties)s
      AS SELECT %(columns_list)s
      FROM `%(database)s`.`%(table_name)s`;''' % {
        'database': database,
        'final_table_name': final_table_name,
        'table_name': table_name,
        'extra_create_properties': extra_create_properties,
        'columns_list': ', '.join(columns_list),
        'comment': ' COMMENT "%s"' % comment if comment else ''
    }

    sql += '\n\nDROP TABLE IF EXISTS `%(database)s`.`%(table_name)s`;\n' % {
        'database': database,
        'table_name': table_name
    }

  editor_type = 'impala' if table_format == 'kudu' else destination['sourceType']

  on_success_url = reverse('metastore:describe_table', kwargs={
      'database': database,
      'table': final_table_name
  }) + '?source_type=' + source_type

  return make_notebook(
      name=_('Creating table %(database)s.%(table)s') % {'database': database, 'table': final_table_name},
      editor_type=editor_type,
      statement=sql.strip(),
      status='ready',
      database=database,
      on_success_url=on_success_url,
      last_executed=start_time,
      is_task=True
  )