def _get_loader(self, app):
    if app in self.loaders:
        return self.loaders[app]

    # Lazily find a temp dir for module_dir.
    # This laziness is important because at initialization time
    # we might still be running as root during desktop startup
    # and thus the temp dir would be owned by root, not the
    # unprivileged user!
    if self.module_dir is None:
        self.module_dir = tempfile.mkdtemp()  # TODO(todd) configurable?

    app_module = __import__(app)
    app_dir = os.path.dirname(app_module.__file__)
    app_template_dir = os.path.join(app_dir, 'templates')
    loader = TemplateLookup(
        directories=[app_template_dir, self.desktop_template_dir],
        module_directory=os.path.join(self.module_dir, app),
        output_encoding=i18n.get_site_encoding(),
        input_encoding=i18n.get_site_encoding(),
        encoding_errors=ENCODING_ERRORS,
        default_filters=['unicode', 'escape'],
        imports=IMPORTS)
    # TODO(philip): Make a django_aware default filter, that understands
    # django safe strings. See http://www.makotemplates.org/docs/filtering.html.
    self.loaders[app] = loader
    return loader
def __init__(self, encoding=None):
    super(XLSformatter, self).__init__()
    self._encoding = encoding or i18n.get_site_encoding()
    self._book = xl.Workbook()
    self._sheet = self._book.add_sheet("Sheet 1")
    self._row = 0
    self._size = 0
def __init__(self, encoding=None):
    super(XLSformatter, self).__init__()
    self._encoding = encoding or i18n.get_site_encoding()
    self._book = Workbook(optimized_write=True, encoding='utf-8')
    self._sheet = self._book.create_sheet(title='RESULT')
    self._row = 0
    self._size = 0
def __init__(self, encoding=None):
    super(CSVformatter, self).__init__()
    dialect = csv.excel()
    dialect.quoting = csv.QUOTE_ALL
    self._encoding = encoding or i18n.get_site_encoding()
    self._csv_writer = csv.writer(self, dialect=dialect)
    self._line = None
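# The `csv.writer(self, dialect=dialect)` call above works because the formatter
# is duck-typed as a writable file: csv.writer only needs a write() method on its
# destination. A minimal, self-contained sketch of that pattern (the class and
# method names below are hypothetical, not from the original source):

import csv

class LineCapturingFormatter(object):
    """File-like target: csv.writer calls write() with each serialized row."""

    def __init__(self):
        dialect = csv.excel()
        dialect.quoting = csv.QUOTE_ALL
        self._csv_writer = csv.writer(self, dialect=dialect)
        self._line = None

    def write(self, data):
        # csv.writer hands us the fully formatted line (including line ending).
        self._line = data

    def format(self, row):
        self._csv_writer.writerow(row)
        return self._line

formatter = LineCapturingFormatter()
print(formatter.format(['a', 'b,c', 'd']))  # '"a","b,c","d"\r\n'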
def format(row, encoding=None):
    return [smart_str(nullify(cell), encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
            for cell in row]
def encode_row(row, encoding=None, is_xls=False):
    encoded_row = []
    for cell in row:
        if is_xls and isinstance(cell, str):
            cell = re.sub(XLS_ILLEGAL_CHARS, '?', cell)
        cell = smart_str(nullify(cell), encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
        encoded_row.append(cell)
    return encoded_row
def _get_sample(cls, file_stream):
    encoding = i18n.get_site_encoding()
    for reader in [TextFileReader, GzipFileReader]:
        file_stream.seek(0)
        sample_data, sample_lines = reader.readlines(file_stream, encoding)
        file_stream.seek(0)
        if sample_data is not None:
            yield sample_data, sample_lines
def encode_row(row, encoding=None):
    encoded_row = []
    for cell in row:
        if isinstance(cell, six.string_types):
            cell = re.sub(ILLEGAL_CHARS, '?', cell)
        cell = nullify(cell)
        if not isinstance(cell, numbers.Number):
            cell = smart_str(cell, encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
        encoded_row.append(cell)
    return encoded_row
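# ILLEGAL_CHARS above is assumed to match control characters that spreadsheet
# writers reject (XLSX is XML 1.0, which forbids most C0 controls). A minimal,
# self-contained sketch of that scrubbing step; the pattern below is an
# illustration, not the original ILLEGAL_CHARS definition:

import re

SPREADSHEET_ILLEGAL_CHARS = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1f]')

def scrub(cell):
    """Replace characters an XLSX writer would reject with '?'."""
    return SPREADSHEET_ILLEGAL_CHARS.sub('?', cell)

print(scrub(u'ok\x01value'))  # ok?value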
def _get_sample(cls, file_stream):
    encoding = i18n.get_site_encoding()
    for reader in [TextFileReader, GzipFileReader]:
        file_stream.seek(0)
        lines = reader.readlines(file_stream, encoding)
        file_stream.seek(0)
        if lines is not None:
            yield '\n'.join(lines)
def encode_row(row, encoding=None, make_excel_links=False):
    encoded_row = []
    for cell in row:
        if isinstance(cell, six.string_types):
            cell = re.sub(ILLEGAL_CHARS, '?', cell)
            if make_excel_links:
                cell = re.compile('(https?://.+)', re.IGNORECASE).sub(r'=HYPERLINK("\1")', cell)
        cell = nullify(cell)
        if not isinstance(cell, numbers.Number):
            cell = smart_str(cell, encoding or i18n.get_site_encoding(), strings_only=True, errors='replace')
        encoded_row.append(cell)
    return encoded_row
def test_remove_header(self):
    fs = self.cluster.fs
    path = "/tmp/test_remove_header.txt"
    data_header = "destination\trank"
    data_body = """thailand\t10
costarica\t?
curacao\t?"""
    data = data_header + '\n' + data_body

    f = fs.open(path, "w")
    f.write("hello")
    f.close()

    encoding = i18n.get_site_encoding()
    do_overwrite_save(fs, path, data.encode(encoding))

    assert_not_equal(data_body, fs.open(path).read())

    remove_header(fs, path)

    assert_equals(data_body, fs.open(path).read())
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    databases = [{'name': db, 'url': reverse('beeswax:import_wizard', kwargs={'database': db})} for db in dbs]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1 : Table name and file location
        #   * Step 2a : Display sample with auto chosen delim
        #   * Step 2b : Display sample with user chosen delim (if user chooses one)
        #   * Step 3 : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                 cancel_s2_user_delim, cancel_s3_column_def))) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                    'databases': databases
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append({
                            'column_name': 'col_%s' % (i,),
                            'column_type': 'string',
                        })
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        fields_list_for_json[0] = map(lambda a: re.sub('[^\w]', '', a), fields_list_for_json[0])  # Cleaning headers
                    return render('define_columns.mako', request, {
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'column_formset': s3_col_formset,
                        'fields_list': fields_list,
                        'fields_list_json': json.dumps(fields_list_for_json),
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })
                except Exception, e:
                    raise PopupException(_("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."), detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                    'databases': databases
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                try:
                    return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data, database=database)
                except QueryServerException, e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)
def clean_encoding(self):
    encoding = self.cleaned_data.get("encoding", "").strip()
    if not encoding:
        return i18n.get_site_encoding()
    return encoding
def import_wizard(request, database=None):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    database = _get_last_database(request, database)
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    if request.method == 'POST':
        # One-iteration loop so we have an easy way to break out early
        for _ in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1 : Table name and file location
            #   * Step 2a : Display sample with auto chosen delim
            #   * Step 2b : Display sample with user chosen delim (if user chooses one)
            #   * Step 3 : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            db = dbms.get(request.user)
            s1_file_form = CreateByImportFileForm(request.POST, db=db)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            assert len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                     cancel_s2_user_delim, cancel_s3_column_def))) == 1, 'Invalid form submission'

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    [reader.TYPE for reader in FILE_READERS], DELIMITERS, False)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],),
                    s2_delim_form.cleaned_data.get('read_column_headers'))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
                if s3_col_formset is None or not read_column_headers:
                    columns = []
                    if read_column_headers and fields_list:
                        first_row = fields_list[0]
                        for i in range(n_cols):
                            columns.append(dict(
                                column_name=first_row[i] if i < len(first_row) else 'col_%s' % (i + 1,),
                                column_type='string',
                            ))
                        fields_list = fields_list[1:]
                    else:
                        for i in range(n_cols):
                            columns.append(dict(
                                column_name='col_%s' % (i + 1,),
                                column_type='string',
                            ))
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'column_formset': s3_col_formset,
                    'fields_list': fields_list,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(
                        name=table_name,
                        comment=s1_file_form.cleaned_data['comment'],
                        row_format='Delimited',
                        field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data.get('path')

                read_column_headers = s2_delim_form.cleaned_data.get('read_column_headers')
                if read_column_headers and do_load_data:
                    file_type = s2_delim_form.cleaned_data.get('file_type')
                    file_readers = [reader for reader in FILE_READERS if reader.TYPE == file_type]
                    if len(file_readers) == 1:
                        try:
                            file_obj = request.fs.open(path)
                            _skip_first_line_in_file(file_readers[0], request.fs, path,
                                                     file_readers[0].find(file_obj, '\n') + 1)
                        except Exception, ex:
                            msg = _('Cannot process file: %s' % (ex,))
                            LOG.error(msg)
                            raise PopupException(msg)

                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data,
                                               database=database)
def display(request, path):
    """
    Implements displaying part of a file.

    GET arguments are length, offset, mode, compression and encoding
    with reasonable defaults chosen.

    Note that display by length and offset are on bytes, not on characters.

    TODO(philip): Could easily build in file-type detection (perhaps using
    something similar to file(1)), as well as more advanced binary-file viewing
    capability (de-serialize sequence files, decompress gzipped text files, etc.).
    There exists a python-magic package to interface with libmagic.
    """
    path = _unquote_path(path)
    if not request.fs.isfile(path):
        raise PopupException("Not a file: '%s'" % (path,))

    stats = request.fs.stats(path)
    encoding = request.GET.get('encoding') or i18n.get_site_encoding()

    # I'm mixing URL-based parameters and traditional
    # HTTP GET parameters, since URL-based parameters
    # can't naturally be optional.

    # Need to deal with possibility that length is not present
    # because the offset came in via the toolbar manual byte entry.
    end = request.GET.get("end")
    if end:
        end = int(end)
    begin = request.GET.get("begin", 1)
    if begin:
        # Subtract one to zero index for file read
        begin = int(begin) - 1
    if end:
        offset = begin
        length = end - begin
        if begin >= end:
            raise PopupException("First byte to display must be before last byte to display.")
    else:
        length = int(request.GET.get("length", DEFAULT_CHUNK_SIZE_BYTES))
        # Display first block by default.
        offset = int(request.GET.get("offset", 0))

    mode = request.GET.get("mode")
    compression = request.GET.get("compression")

    if mode and mode not in ["binary", "text"]:
        raise PopupException("Mode must be one of 'binary' or 'text'.")
    if offset < 0:
        raise PopupException("Offset may not be less than zero.")
    if length < 0:
        raise PopupException("Length may not be less than zero.")
    if length > MAX_CHUNK_SIZE_BYTES:
        raise PopupException("Cannot request chunks greater than %d bytes" % MAX_CHUNK_SIZE_BYTES)

    # Auto gzip detection, unless we are explicitly told to view binary
    if not compression and mode != 'binary':
        if path.endswith('.gz') and detect_gzip(request.fs.open(path).read(2)):
            compression = 'gzip'
            offset = 0
        else:
            compression = 'none'

    f = request.fs.open(path)

    if compression == 'gzip':
        if offset and offset != 0:
            raise PopupException("We don't support offsets with gzip compression.")
        try:
            try:
                contents = GzipFile('', 'r', 0, StringIO(f.read())).read(length)
            except:
                logging.warn("Could not decompress file at %s" % path, exc_info=True)
                contents = ''
                raise PopupException("Failed to decompress file")
        finally:
            f.close()
    else:
        try:
            f.seek(offset)
            contents = f.read(length)
        finally:
            f.close()

    # Get contents as string for text mode, or at least try
    uni_contents = None
    if not mode or mode == 'text':
        uni_contents = unicode(contents, encoding, errors='replace')
        is_binary = uni_contents.find(i18n.REPLACEMENT_CHAR) != -1
        # Auto-detect mode
        if not mode:
            mode = is_binary and 'binary' or 'text'

    # Get contents as bytes
    if mode == "binary":
        xxd_out = list(xxd.xxd(offset, contents, BYTES_PER_LINE, BYTES_PER_SENTENCE))

    dirname = posixpath.dirname(path)
    # Start with index-like data:
    data = _massage_stats(request, request.fs.stats(path))
    # And add a view structure:
    data["success"] = True
    data["view"] = {
        'offset': offset,
        'length': length,
        'end': offset + len(contents),
        'dirname': dirname,
        'mode': mode,
        'compression': compression,
        'size': stats['size']
    }
    data["filename"] = os.path.basename(path)
    data["editable"] = stats['size'] < MAX_FILEEDITOR_SIZE
    if mode == "binary":
        # This might be the wrong thing for ?format=json; doing the
        # xxd'ing in javascript might be more compact, or sending a less
        # intermediate representation...
        logger.debug("xxd: " + str(xxd_out))
        data['view']['xxd'] = xxd_out
        data['view']['masked_binary_data'] = False
    else:
        data['view']['contents'] = uni_contents
        data['view']['masked_binary_data'] = is_binary

    return render_with_toolbars("display.mako", request, data)
def import_wizard(request):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()

    if request.method == 'POST':
        # One-iteration loop so we have an easy way to break out early
        for _ in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1 : Table name and file location
            #   * Step 2a : Display sample with auto chosen delim
            #   * Step 2b : Display sample with user chosen delim (if user chooses one)
            #   * Step 3 : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            s1_file_form = beeswax.forms.CreateByImportFileForm(request.POST)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            assert len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                     cancel_s2_user_delim, cancel_s3_column_def))) == 1, 'Invalid form submission'

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = beeswax.forms.CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, dict(
                    action=urlresolvers.reverse(import_wizard),
                    delim_readable=DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
                    initial=delim_is_auto,
                    file_form=s1_file_form,
                    delim_form=s2_delim_form,
                    fields_list=fields_list,
                    delimiter_choices=beeswax.forms.TERMINATOR_CHOICES,
                    n_cols=n_cols,
                ))

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append(dict(
                            column_name='col_%s' % (i,),
                            column_type='string',
                        ))
                    s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, dict(
                    action=urlresolvers.reverse(import_wizard),
                    file_form=s1_file_form,
                    delim_form=s2_delim_form,
                    column_formset=s3_col_formset,
                    fields_list=fields_list,
                    n_cols=n_cols,
                ))

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(
                        name=table_name,
                        comment=s1_file_form.cleaned_data['comment'],
                        row_format='Delimited',
                        field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': []
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data)
    else:
        s1_file_form = beeswax.forms.CreateByImportFileForm()

    return render('choose_file.mako', request, dict(
        action=urlresolvers.reverse(import_wizard),
        file_form=s1_file_form,
    ))
    # A file not found is OK, otherwise re-raise
    if ioe.errno == errno.ENOENT:
        stats = None
    else:
        raise

# Can't edit a directory
if stats and stats["mode"] & stat_module.S_IFDIR:
    raise PopupException(_("Cannot edit a directory: %(path)s") % {"path": path})

# Maximum size of edit
if stats and stats["size"] > MAX_FILEEDITOR_SIZE:
    raise PopupException(_("File too big to edit: %(path)s") % {"path": path})

if not form:
    encoding = request.REQUEST.get("encoding") or i18n.get_site_encoding()
    if stats:
        f = request.fs.open(path)
        try:
            try:
                current_contents = unicode(f.read(), encoding)
            except UnicodeDecodeError:
                raise PopupException(
                    _("File is not encoded in %(encoding)s; cannot be edited: %(path)s")
                    % {"encoding": encoding, "path": path})
        finally:
            f.close()
    else:
        current_contents = u""
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    databases = [{'name': db, 'url': reverse('beeswax:import_wizard', kwargs={'database': db})} for db in dbs]

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1 : Table name and file location
        #   * Step 2a : Display sample with auto chosen delim
        #   * Step 2b : Display sample with user chosen delim (if user chooses one)
        #   * Step 3 : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            if len([_f for _f in (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                  cancel_s2_user_delim, cancel_s3_column_def) if _f]) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            load_data = s1_file_form.cleaned_data.get('load_data', 'IMPORT').upper()
            path = s1_file_form.cleaned_data['path']

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                try:
                    if load_data == 'IMPORT':
                        if not request.fs.isfile(path):
                            raise PopupException(_('Path location must refer to a file if "Import Data" is selected.'))
                    elif load_data == 'EXTERNAL':
                        if not request.fs.isdir(path):
                            raise PopupException(_('Path location must refer to a directory if "Create External Table" is selected.'))
                except (IOError, S3FileSystemException) as e:
                    raise PopupException(_('Path location "%s" is invalid: %s') % (path, e))

                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                apps_list = _get_apps(request.user, '')
                return render('import_wizard_choose_delimiter.mako', request, {
                    'apps': apps_list,
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                    'databases': databases
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append({
                            'column_name': 'col_%s' % (i,),
                            'column_type': 'string',
                        })
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        fields_list_for_json[0] = [re.sub('[^\w]', '', a) for a in fields_list_for_json[0]]  # Cleaning headers
                    apps_list = _get_apps(request.user, '')
                    return render('import_wizard_define_columns.mako', request, {
                        'apps': apps_list,
                        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                        'file_form': s1_file_form,
                        'delim_form': s2_delim_form,
                        'column_formset': s3_col_formset,
                        'fields_list': fields_list,
                        'fields_list_json': json.dumps(fields_list_for_json),
                        'n_cols': n_cols,
                        'database': database,
                        'databases': databases
                    })
                except Exception as e:
                    raise PopupException(_("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."), detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': {
                        'name': table_name,
                        'comment': s1_file_form.cleaned_data['comment'],
                        'row_format': 'Delimited',
                        'field_terminator': delim,
                        'file_format': 'TextFile',
                        'load_data': load_data,
                        'path': path,
                        'skip_header': request.GET.get('removeHeader', 'off').lower() == 'on'
                    },
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                    'databases': databases
                })

                try:
                    return _submit_create_and_load(request, proposed_query, table_name, path, load_data, database=database)
                except QueryServerException as e:
                    raise PopupException(_('The table could not be created.'), detail=e.message)
    else:
        s1_file_form = CreateByImportFileForm()

    return render('import_wizard_choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
        'databases': databases
    })
def import_wizard(request, database="default"):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    db = dbms.get(request.user)
    dbs = db.get_databases()
    databases = [{"name": db, "url": reverse("beeswax:import_wizard", kwargs={"database": db})} for db in dbs]

    if request.method == "POST":
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1 : Table name and file location
        #   * Step 2a : Display sample with auto chosen delim
        #   * Step 2b : Display sample with user chosen delim (if user chooses one)
        #   * Step 3 : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get("submit_file")        # Step 1 -> 2
            do_s2_user_delim = request.POST.get("submit_preview")     # Step 2 -> 2
            do_s3_column_def = request.POST.get("submit_delim")       # Step 2 -> 3
            do_hive_create = request.POST.get("submit_create")        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get("cancel_delim")   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get("cancel_create")  # Step 3 -> 2

            # Exactly one of these should be True
            if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                 cancel_s2_user_delim, cancel_s3_column_def))) != 1:
                raise PopupException(_("Invalid form submission"))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix="cols", data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data["file_type"],),
                    (s2_delim_form.cleaned_data["delimiter"],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render("choose_delimiter.mako", request, {
                    "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                    "delim_readable": DELIMITER_READABLE.get(s2_delim_form["delimiter"].data[0], s2_delim_form["delimiter"].data[1]),
                    "initial": delim_is_auto,
                    "file_form": s1_file_form,
                    "delim_form": s2_delim_form,
                    "fields_list": fields_list,
                    "delimiter_choices": TERMINATOR_CHOICES,
                    "n_cols": n_cols,
                    "database": database,
                    "databases": databases,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append({"column_name": "col_%s" % (i,), "column_type": "string"})
                    s3_col_formset = ColumnTypeFormSet(prefix="cols", initial=columns)
                try:
                    fields_list_for_json = list(fields_list)
                    if fields_list_for_json:
                        fields_list_for_json[0] = map(lambda a: re.sub("[^\w]", "", a), fields_list_for_json[0])  # Cleaning headers
                    return render("define_columns.mako", request, {
                        "action": reverse(app_name + ":import_wizard", kwargs={"database": database}),
                        "file_form": s1_file_form,
                        "delim_form": s2_delim_form,
                        "column_formset": s3_col_formset,
                        "fields_list": fields_list,
                        "fields_list_json": json.dumps(fields_list_for_json),
                        "n_cols": n_cols,
                        "database": database,
                        "databases": databases,
                    })
                except Exception, e:
                    raise PopupException(_("The selected delimiter is creating an un-even number of columns. Please make sure you don't have empty columns."), detail=e)

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data["delimiter"]
                table_name = s1_file_form.cleaned_data["name"]
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    "table": {
                        "name": table_name,
                        "comment": s1_file_form.cleaned_data["comment"],
                        "row_format": "Delimited",
                        "field_terminator": delim,
                    },
                    "columns": [f.cleaned_data for f in s3_col_formset.forms],
                    "partition_columns": [],
                    "database": database,
                    "databases": databases,
                })

                do_load_data = s1_file_form.cleaned_data.get("do_import")
                path = s1_file_form.cleaned_data["path"]
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data, database=database)
    # A file not found is OK, otherwise re-raise
    if ioe.errno == errno.ENOENT:
        stats = None
    else:
        raise

# Can't edit a directory
if stats and stats['mode'] & stat_module.S_IFDIR:
    raise PopupException(_("Cannot edit a directory: %(path)s") % {'path': path})

# Maximum size of edit
if stats and stats['size'] > MAX_FILEEDITOR_SIZE:
    raise PopupException(_("File too big to edit: %(path)s") % {'path': path})

if not form:
    encoding = request.REQUEST.get('encoding') or i18n.get_site_encoding()
    if stats:
        f = request.fs.open(path)
        try:
            try:
                current_contents = unicode(f.read(), encoding)
            except UnicodeDecodeError:
                raise PopupException(
                    _("File is not encoded in %(encoding)s; cannot be edited: %(path)s.")
                    % {'encoding': encoding, 'path': path})
        finally:
            f.close()
    else:
        current_contents = u""

    form = EditorForm(dict(path=path, contents=current_contents, encoding=encoding))

data = dict(
def run_bin_hadoop_step(self, step):
    """
    user.name is used by FileSystem.getHomeDirectory().
    The environment variables for _USER and _GROUPS are used
    by the aspectj aspect to overwrite Hadoop's notion of
    users and groups.
    """
    java_properties = {}
    java_properties["hue.suffix"] = "-via-hue"
    java_properties["user.name"] = self.plan.user
    java_prop_str = " ".join("-D%s=%s" % (k, v) for k, v in java_properties.iteritems())
    env = {
        'HADOOP_HOME': hadoop.conf.HADOOP_HOME.get(),
        'HADOOP_OPTS': "-javaagent:%s %s" % (jobsub.conf.ASPECTJWEAVER.get(), java_prop_str),
        'HADOOP_CLASSPATH': ':'.join([jobsub.conf.ASPECTPATH.get(),
                                      hadoop.conf.HADOOP_EXTRA_CLASSPATH_STRING.get()]),
        'HUE_JOBTRACE_LOG': self.internal_file_name("jobs"),
        'HUE_JOBSUB_USER': self.plan.user,
        'HUE_JOBSUB_GROUPS': ",".join(self.plan.groups),
        'LANG': os.getenv('LANG', i18n.get_site_encoding()),
    }

    all_clusters = []
    all_clusters += all_mrclusters().values()
    all_clusters += get_all_hdfs().values()

    delegation_token_files = []
    merged_token_file = tempfile.NamedTemporaryFile()
    try:
        LOG.debug("all_clusters: %s" % (repr(all_clusters),))
        for cluster in all_clusters:
            if cluster.security_enabled:
                cluster.setuser(self.plan.user)
                token = cluster.get_delegation_token()
                token_file = tempfile.NamedTemporaryFile()
                token_file.write(token.delegationTokenBytes)
                token_file.flush()
                delegation_token_files.append(token_file)

        java_home = os.getenv('JAVA_HOME')
        if java_home:
            env["JAVA_HOME"] = java_home
        for k, v in env.iteritems():
            assert v is not None, "Environment key %s missing value." % k

        base_args = [hadoop.conf.HADOOP_BIN.get()]
        if hadoop.conf.HADOOP_CONF_DIR.get():
            base_args.append("--config")
            base_args.append(hadoop.conf.HADOOP_CONF_DIR.get())

        if delegation_token_files:
            args = list(base_args)  # Make a copy of the base args.
            args += ['jar', hadoop.conf.CREDENTIALS_MERGER_JAR.get(), merged_token_file.name]
            args += [token_file.name for token_file in delegation_token_files]
            LOG.debug("merging credentials files with command: '%s'" % (' '.join(args),))
            merge_pipe = subprocess.Popen(args, shell=False, close_fds=True)
            retcode = merge_pipe.wait()
            if 0 != retcode:
                raise Exception("bin/hadoop returned non-zero %d while trying to merge credentials" % (retcode,))
            env['HADOOP_TOKEN_FILE_LOCATION'] = merged_token_file.name

        args = list(base_args)  # Make a copy of the base args.
        args += step.arguments
        LOG.info("Starting %s. (Env: %s)", repr(args), repr(env))
        LOG.info("Running: %s" % " ".join(args))
        self.pipe = subprocess.Popen(
            args,
            stdin=None,
            cwd=self.work_dir,
            stdout=self.stdout,
            stderr=self.stderr,
            shell=False,
            close_fds=True,
            env=env)
        retcode = self.pipe.wait()
        if 0 != retcode:
            raise Exception("bin/hadoop returned non-zero %d" % retcode)
        LOG.info("bin/hadoop returned %d" % retcode)
    finally:
        for token_file in delegation_token_files + [merged_token_file]:
            token_file.close()
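# The delegation-token handling above relies on NamedTemporaryFile semantics:
# each token file must be flushed so a child process can read it, and must stay
# open (closing deletes it) until the merging subprocess has finished. A minimal,
# self-contained sketch of that lifetime pattern, using `cat` as a stand-in for
# the credentials-merger jar (Unix only; all names below are hypothetical):

import subprocess
import tempfile

token_files = []
merged_token_file = tempfile.NamedTemporaryFile()
try:
    for token_bytes in [b'token-a', b'token-b']:
        token_file = tempfile.NamedTemporaryFile()
        token_file.write(token_bytes)
        token_file.flush()  # Make the bytes visible to the child process.
        token_files.append(token_file)

    args = ['cat'] + [tf.name for tf in token_files]
    with open(merged_token_file.name, 'wb') as out:
        subprocess.check_call(args, stdout=out)

    print(open(merged_token_file.name, 'rb').read())  # b'token-atoken-b'
finally:
    # Closing a NamedTemporaryFile deletes it from disk.
    for token_file in token_files + [merged_token_file]:
        token_file.close()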
def import_wizard(request):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()

    if request.method == 'POST':
        # One-iteration loop so we have an easy way to break out early
        for _ in range(1):
            #
            # General processing logic:
            # - We have 3 steps. Each requires the previous.
            #   * Step 1 : Table name and file location
            #   * Step 2a : Display sample with auto chosen delim
            #   * Step 2b : Display sample with user chosen delim (if user chooses one)
            #   * Step 3 : Display sample, and define columns
            # - Each step is represented by a different form. The form of an earlier step
            #   should be present when submitting to a later step.
            # - To preserve the data from the earlier steps, we send the forms back as
            #   hidden fields. This way, when users revisit a previous step, the data would
            #   be there as well.
            #
            delim_is_auto = False
            fields_list, n_cols = [[]], 0
            s3_col_formset = None

            # Everything requires a valid file form
            s1_file_form = beeswax.forms.CreateByImportFileForm(request.POST)
            if not s1_file_form.is_valid():
                break

            do_s2_auto_delim = request.POST.get('submit_file')     # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')  # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')    # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')     # Step 3 -> execute

            # Exactly one of these should be True
            assert len(filter(None, (do_s2_auto_delim, do_s2_user_delim,
                                     do_s3_column_def, do_hive_create))) == 1, 'Invalid form submission'

            #
            # Fix up what we should do in case any form is invalid
            #
            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = beeswax.forms.CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim:
                return render('choose_delimiter.mako', request, dict(
                    action=urlresolvers.reverse(import_wizard),
                    delim_readable=DELIMITER_READABLE[s2_delim_form['delimiter'].data[0]],
                    initial=delim_is_auto,
                    file_form=s1_file_form,
                    delim_form=s2_delim_form,
                    fields_list=fields_list,
                    delimiter_choices=beeswax.forms.TERMINATOR_CHOICES,
                    n_cols=n_cols,
                ))

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append(dict(
                            column_name='col_%s' % (i,),
                            column_type='string',
                        ))
                    s3_col_formset = beeswax.forms.ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, dict(
                    action=urlresolvers.reverse(import_wizard),
                    file_form=s1_file_form,
                    delim_form=s2_delim_form,
                    column_formset=s3_col_formset,
                    fields_list=fields_list,
                    n_cols=n_cols,
                ))

            #
            # Finale: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(
                        name=table_name,
                        comment=s1_file_form.cleaned_data['comment'],
                        row_format='Delimited',
                        field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': []
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data)
    else:
        s1_file_form = beeswax.forms.CreateByImportFileForm()

    return render('choose_file.mako', request, dict(
        action=urlresolvers.reverse(import_wizard),
        file_form=s1_file_form,
    ))
def clean_encoding(self):
    encoding = self.cleaned_data.get('encoding', '').strip()
    if not encoding:
        return i18n.get_site_encoding()
    return encoding
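# clean_encoding() above returns whatever the user typed once it is non-empty.
# A variant one might add (hypothetical, not in the original form) also rejects
# names that Python's codec registry does not know, via codecs.lookup():

import codecs

def clean_encoding(value, default='utf-8'):
    """Fall back to a default and reject unknown codec names."""
    encoding = (value or '').strip()
    if not encoding:
        return default
    codecs.lookup(encoding)  # Raises LookupError for unknown encodings.
    return encoding

print(clean_encoding('  latin-1 '))  # latin-1
print(clean_encoding(''))            # utf-8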
def import_wizard(request, database='default'):
    """
    Help users define a table based on a file they want to import to Hive.

    Limitations:
      - Rows are delimited (no serde).
      - No detection for map and array types.
      - No detection for the presence of column header in the first row.
      - No partition table.
      - Does not work with binary data.
    """
    encoding = i18n.get_site_encoding()
    app_name = get_app_name(request)

    if request.method == 'POST':
        #
        # General processing logic:
        # - We have 3 steps. Each requires the previous.
        #   * Step 1 : Table name and file location
        #   * Step 2a : Display sample with auto chosen delim
        #   * Step 2b : Display sample with user chosen delim (if user chooses one)
        #   * Step 3 : Display sample, and define columns
        # - Each step is represented by a different form. The form of an earlier step
        #   should be present when submitting to a later step.
        # - To preserve the data from the earlier steps, we send the forms back as
        #   hidden fields. This way, when users revisit a previous step, the data would
        #   be there as well.
        #
        delim_is_auto = False
        fields_list, n_cols = [[]], 0
        s3_col_formset = None

        db = dbms.get(request.user)
        s1_file_form = CreateByImportFileForm(request.POST, db=db)

        if s1_file_form.is_valid():
            do_s2_auto_delim = request.POST.get('submit_file')        # Step 1 -> 2
            do_s2_user_delim = request.POST.get('submit_preview')     # Step 2 -> 2
            do_s3_column_def = request.POST.get('submit_delim')       # Step 2 -> 3
            do_hive_create = request.POST.get('submit_create')        # Step 3 -> execute
            cancel_s2_user_delim = request.POST.get('cancel_delim')   # Step 2 -> 1
            cancel_s3_column_def = request.POST.get('cancel_create')  # Step 3 -> 2

            # Exactly one of these should be True
            if len(filter(None, (do_s2_auto_delim, do_s2_user_delim, do_s3_column_def, do_hive_create,
                                 cancel_s2_user_delim, cancel_s3_column_def))) != 1:
                raise PopupException(_('Invalid form submission'))

            if not do_s2_auto_delim:
                # We should have a valid delim form
                s2_delim_form = CreateByImportDelimForm(request.POST)
                if not s2_delim_form.is_valid():
                    # Go back to picking delimiter
                    do_s2_user_delim, do_s3_column_def, do_hive_create = True, False, False
            if do_hive_create:
                # We should have a valid columns formset
                s3_col_formset = ColumnTypeFormSet(prefix='cols', data=request.POST)
                if not s3_col_formset.is_valid():
                    # Go back to define columns
                    do_s3_column_def, do_hive_create = True, False

            #
            # Go to step 2: We've just picked the file. Preview it.
            #
            if do_s2_auto_delim:
                delim_is_auto = True
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding, [reader.TYPE for reader in FILE_READERS], DELIMITERS)

            if (do_s2_user_delim or do_s3_column_def or cancel_s3_column_def) and s2_delim_form.is_valid():
                # Delimit based on input
                fields_list, n_cols, s2_delim_form = _delim_preview(
                    request.fs, s1_file_form, encoding,
                    (s2_delim_form.cleaned_data['file_type'],),
                    (s2_delim_form.cleaned_data['delimiter'],))

            if do_s2_auto_delim or do_s2_user_delim or cancel_s3_column_def:
                return render('choose_delimiter.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'delim_readable': DELIMITER_READABLE.get(s2_delim_form['delimiter'].data[0], s2_delim_form['delimiter'].data[1]),
                    'initial': delim_is_auto,
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'fields_list': fields_list,
                    'delimiter_choices': TERMINATOR_CHOICES,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Go to step 3: Define column.
            #
            if do_s3_column_def:
                if s3_col_formset is None:
                    columns = []
                    for i in range(n_cols):
                        columns.append(dict(
                            column_name='col_%s' % (i,),
                            column_type='string',
                        ))
                    s3_col_formset = ColumnTypeFormSet(prefix='cols', initial=columns)
                return render('define_columns.mako', request, {
                    'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
                    'file_form': s1_file_form,
                    'delim_form': s2_delim_form,
                    'column_formset': s3_col_formset,
                    'fields_list': fields_list,
                    'n_cols': n_cols,
                    'database': database,
                })

            #
            # Final: Execute
            #
            if do_hive_create:
                delim = s2_delim_form.cleaned_data['delimiter']
                table_name = s1_file_form.cleaned_data['name']
                proposed_query = django_mako.render_to_string("create_table_statement.mako", {
                    'table': dict(
                        name=table_name,
                        comment=s1_file_form.cleaned_data['comment'],
                        row_format='Delimited',
                        field_terminator=delim),
                    'columns': [f.cleaned_data for f in s3_col_formset.forms],
                    'partition_columns': [],
                    'database': database,
                })

                do_load_data = s1_file_form.cleaned_data.get('do_import')
                path = s1_file_form.cleaned_data['path']
                return _submit_create_and_load(request, proposed_query, table_name, path, do_load_data,
                                               database=database)
    else:
        s1_file_form = CreateByImportFileForm()

    return render('choose_file.mako', request, {
        'action': reverse(app_name + ':import_wizard', kwargs={'database': database}),
        'file_form': s1_file_form,
        'database': database,
    })
def display(request, path):
    """
    Implements displaying part of a file.

    GET arguments are length, offset, mode, compression and encoding
    with reasonable defaults chosen.

    Note that display by length and offset are on bytes, not on characters.

    TODO(philip): Could easily build in file-type detection (perhaps using
    something similar to file(1)), as well as more advanced binary-file viewing
    capability (de-serialize sequence files, decompress gzipped text files, etc.).
    There exists a python-magic package to interface with libmagic.
    """
    if not request.fs.isfile(path):
        raise PopupException(_("Not a file: '%(path)s'") % {'path': path})

    # Files that the browser can render inline are served via the download
    # view instead of the text/binary viewer.
    mimetype = mimetypes.guess_type(path)[0]
    if mimetype is not None and INLINE_DISPLAY_MIMETYPE.search(mimetype):
        path_enc = urlencode(path)
        return redirect(reverse('filebrowser.views.download', args=[path_enc]) + '?disposition=inline')

    stats = request.fs.stats(path)
    encoding = request.GET.get('encoding') or i18n.get_site_encoding()

    # I'm mixing URL-based parameters and traditional
    # HTTP GET parameters, since URL-based parameters
    # can't naturally be optional.

    # Need to deal with possibility that length is not present
    # because the offset came in via the toolbar manual byte entry.
    end = request.GET.get("end")
    if end:
        end = int(end)
    begin = request.GET.get("begin", 1)
    if begin:
        # Subtract one to zero index for file read
        begin = int(begin) - 1
    if end:
        offset = begin
        length = end - begin
        if begin >= end:
            raise PopupException(_("First byte to display must be before last byte to display."))
    else:
        length = int(request.GET.get("length", DEFAULT_CHUNK_SIZE_BYTES))
        # Display first block by default.
        offset = int(request.GET.get("offset", 0))

    mode = request.GET.get("mode")
    compression = request.GET.get("compression")

    if mode and mode not in ["binary", "text"]:
        raise PopupException(_("Mode must be one of 'binary' or 'text'."))
    if offset < 0:
        raise PopupException(_("Offset may not be less than zero."))
    if length < 0:
        raise PopupException(_("Length may not be less than zero."))
    if length > MAX_CHUNK_SIZE_BYTES:
        raise PopupException(_("Cannot request chunks greater than %(bytes)d bytes.") % {'bytes': MAX_CHUNK_SIZE_BYTES})

    # Do not decompress in binary mode.
    if mode == 'binary':
        compression = 'none'

    # Read out based on meta.
    compression, offset, length, contents = \
        read_contents(compression, path, request.fs, offset, length)

    # Get contents as string for text mode, or at least try
    uni_contents = None
    if not mode or mode == 'text':
        uni_contents = unicode(contents, encoding, errors='replace')
        is_binary = uni_contents.find(i18n.REPLACEMENT_CHAR) != -1
        # Auto-detect mode
        if not mode:
            mode = is_binary and 'binary' or 'text'

    # Get contents as bytes
    if mode == "binary":
        xxd_out = list(xxd.xxd(offset, contents, BYTES_PER_LINE, BYTES_PER_SENTENCE))

    dirname = posixpath.dirname(path)
    # Start with index-like data:
    data = _massage_stats(request, request.fs.stats(path))
    # And add a view structure:
    data["success"] = True
    data["view"] = {
        'offset': offset,
        'length': length,
        'end': offset + len(contents),
        'dirname': dirname,
        'mode': mode,
        'compression': compression,
        'size': stats['size'],
        'max_chunk_size': str(MAX_CHUNK_SIZE_BYTES)
    }
    data["filename"] = os.path.basename(path)
    data["editable"] = stats['size'] < MAX_FILEEDITOR_SIZE
    if mode == "binary":
        # This might be the wrong thing for ?format=json; doing the
        # xxd'ing in javascript might be more compact, or sending a less
        # intermediate representation...
        logger.debug("xxd: " + str(xxd_out))
        data['view']['xxd'] = xxd_out
        data['view']['masked_binary_data'] = False
    else:
        data['view']['contents'] = uni_contents
        data['view']['masked_binary_data'] = is_binary

    data['breadcrumbs'] = parse_breadcrumbs(path)

    return render("display.mako", request, data)
def run_bin_hadoop_step(self, step):
    """
    user.name is used by FileSystem.getHomeDirectory().
    The environment variables for _USER and _GROUPS are used
    by the aspectj aspect to overwrite Hadoop's notion of
    users and groups.
    """
    java_properties = {}
    java_properties["hue.suffix"] = "-via-hue"
    java_properties["user.name"] = self.plan.user
    java_prop_str = " ".join("-D%s=%s" % (k, v) for k, v in java_properties.iteritems())
    env = {
        'HADOOP_HOME': hadoop.conf.HADOOP_HOME.get(),
        'HADOOP_OPTS': "-javaagent:%s %s" % (jobsub.conf.ASPECTJWEAVER.get(), java_prop_str),
        'HADOOP_CLASSPATH': ':'.join([jobsub.conf.ASPECTPATH.get(),
                                      hadoop.conf.HADOOP_EXTRA_CLASSPATH_STRING.get()]),
        'HUE_JOBTRACE_LOG': self.internal_file_name("jobs"),
        'HUE_JOBSUB_USER': self.plan.user,
        'HUE_JOBSUB_GROUPS': ",".join(self.plan.groups),
        'LANG': os.getenv('LANG', i18n.get_site_encoding()),
    }

    delegation_token_files = []
    all_clusters = []
    all_clusters += all_mrclusters().values()
    all_clusters += get_all_hdfs().values()
    LOG.info("all_clusters: %s" % (repr(all_clusters),))
    for cluster in all_clusters:
        if cluster.security_enabled:
            cluster.setuser(self.plan.user)
            token = cluster.get_delegation_token()
            token_file = tempfile.NamedTemporaryFile()
            token_file.write(token.delegationTokenBytes)
            token_file.flush()
            delegation_token_files.append(token_file)

    if delegation_token_files:
        env['HADOOP_TOKEN_FILE_LOCATION'] = ','.join(
            [token_file.name for token_file in delegation_token_files])

    java_home = os.getenv('JAVA_HOME')
    if java_home:
        env["JAVA_HOME"] = java_home
    for k, v in env.iteritems():
        assert v is not None, "Environment key %s missing value." % k

    args = [hadoop.conf.HADOOP_BIN.get()]
    if hadoop.conf.HADOOP_CONF_DIR.get():
        args.append("--config")
        args.append(hadoop.conf.HADOOP_CONF_DIR.get())
    args += step.arguments

    LOG.info("Starting %s. (Env: %s)", repr(args), repr(env))
    LOG.info("Running: %s" % " ".join(args))
    self.pipe = subprocess.Popen(
        args,
        stdin=None,
        cwd=self.work_dir,
        stdout=self.stdout,
        stderr=self.stderr,
        shell=False,
        close_fds=True,
        env=env)
    retcode = self.pipe.wait()
    if 0 != retcode:
        raise Exception("bin/hadoop returned non-zero %d" % retcode)
    LOG.info("bin/hadoop returned %d" % retcode)

    for token_file in delegation_token_files:
        token_file.close()