def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.

    Args:
      request: the incoming HTTP request. ``request.method``, ``request.POST``,
        ``request.user`` and ``request.fs`` are read.
      database: name of the target database; it is passed to the templates and
        used to build the form ``action`` URL.

    Returns:
      The result of ``render(...)`` for the step to display, or the result of
      ``_submit_create_and_load(...)`` once the final step is submitted.

    Raises:
      PopupException: if the POST does not carry exactly one of the
        submit/cancel buttons.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        db = dbms.get(request.user)
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True.
            # A list is built explicitly: on Python 3 ``filter()`` returns an
            # iterator, which does not support ``len()``.
            pressed = [
                button for button in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if button
            ]
            if len(pressed) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was submitted yet: propose one string column
                    # per previewed field.
                    columns = [
                        {
                            'column_name': 'col_%s' % (i,),
                            'column_type': 'string',
                        }
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'column_formset': s3_col_formset,
                    'fields_list': fields_list,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim,
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(
                    request, proposed_query, table_name, path, do_load_data, database=database)

            # Only 'cancel_delim' (Step 2 -> 1) with a valid delimiter form
            # reaches this point; it falls through to the file chooser below.
    else:
        s1_file_form = CreateByImportFileForm()

    # Step 1: first visit (unbound form), invalid file form, or cancel back
    # from step 2.
    return render('choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
    })
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.

    Args:
      request: the incoming HTTP request. ``request.method``, ``request.POST``,
        ``request.GET``, ``request.user`` and ``request.fs`` are read.
      database: name of the target database; it is passed to the templates and
        used to build the form ``action`` URL.

    Returns:
      The result of ``render(...)`` for the step to display, or the result of
      ``_submit_create_and_load(...)`` once the final step is submitted.

    Raises:
      PopupException: if the POST does not carry exactly one of the
        submit/cancel buttons, if the path does not match the selected load
        mode or cannot be checked, if rendering the column-definition step
        fails, or if the table creation raises ``QueryServerException``.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    # The loop variable must not be called ``db``: on Python 2 a list
    # comprehension variable leaks into the enclosing scope and would replace
    # the dbms handle that is passed to CreateByImportFileForm below.
    databases = [{
        'name': db_name,
        'url': reverse('beeswax:import_wizard', kwargs={'database': db_name})
    } for db_name in dbs]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1      : Table name and file location
        #   * Step 2a     : Display sample with auto chosen delim
        #   * Step 2b     : Display sample with user chosen delim (if user chooses one)
        #   * Step 3      : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute

            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            pressed = [
                _f for _f in (
                    do_s2_auto_delim,
                    do_s2_user_delim,
                    do_s3_column_def,
                    do_hive_create,
                    cancel_s2_user_delim,
                    cancel_s3_column_def,
                ) if _f
            ]
            if len(pressed) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False

            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            load_data = s1_file_form.cleaned_data.get('load_data', 'IMPORT').upper()
            path = s1_file_form.cleaned_data['path']

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                # 'IMPORT' requires a file, 'EXTERNAL' requires a directory;
                # any other load mode skips the path check.
                try:
                    if load_data == 'IMPORT':
                        if not request.fs.isfile(path):
                            raise PopupException(
                                _('Path location must refer to a file if "Import Data" is selected.'))
                    elif load_data == 'EXTERNAL':
                        if not request.fs.isdir(path):
                            raise PopupException(
                                _('Path location must refer to a directory if "Create External Table" is selected.'))
                except (IOError, S3FileSystemException) as e:
                    raise PopupException(
                        _('Path location "%s" is invalid: %s') % (path, e))

                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    [reader.TYPE for reader in FILE_READERS],
                    DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs,
                    s1_file_form,
                    encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                apps_list = _get_apps(request.user, '')
                return render('import_wizard_choose_delimiter.mako', request, {
                    'apps': apps_list,
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(
                        s2_delim_form['delimiter'].data[0],
                        s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                    'databases': databases
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    # No formset was submitted yet: propose one string column
                    # per previewed field.
                    columns = [
                        {
                            'column_name': 'col_%s' % (i,),
                            'column_type': 'string',
                        }
                        for i in range(n_cols)
                    ]
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    # Work on a shallow copy so that 'fields_list' handed to
                    # the template keeps its original first row.
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        # Cleaning headers: keep only word characters.
                        fields_list_for_json[0] = [
                            re.sub(r'[^\w]', '', a) for a in fields_list_for_json[0]
                        ]

                    apps_list = _get_apps(request.user, '')
                    return render('import_wizard_define_columns.mako', request, {
                        'apps': apps_list,
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'column_formset': s3_col_formset,
                        'fields_list': fields_list,
                        'fields_list_json': json.dumps(fields_list_for_json),
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })
                except Exception as e:
                    raise PopupException(
                        _("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."),
                        detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']

                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim,
                        'file_format': 'TextFile',
                        'load_data': load_data,
                        'path': path,
                        # Read from the query string, not from the POST body.
                        'skip_header': request.GET.get('removeHeader', 'off').lower() == 'on'
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                    'databases': databases
                })
                try:
                    return _submit_create_and_load(
                        request, proposed_query, table_name, path, load_data, database=database)
                except QueryServerException as e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)

            # Only 'cancel_delim' (Step 2 -> 1) with a valid delimiter form
            # reaches this point; it falls through to the file chooser below.
    else:
        s1_file_form = CreateByImportFileForm()

    # Step 1: first visit (unbound form), invalid file form, or cancel back
    # from step 2.
    return render('import_wizard_choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
        'databases': databases
    })