def test_guess_csv_format(self):
  stream = StringIO.StringIO(TestIndexer.simpleCSVString)
  indexer = MorphlineIndexer("test", solr_client=self.solr_client)

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  fields = indexer.guess_field_types({
    "file": {"stream": stream, "name": "test.csv"},
    "format": guessed_format
  })['columns']

  # test format
  expected_format = self.simpleCSVFormat
  assert_equal(expected_format, guessed_format)

  # test fields
  expected_fields = self.simpleCSVFields
  for expected, actual in zip(expected_fields, fields):
    for key in ("name", "type"):
      assert_equal(expected[key], actual[key])
def importer_submit(request):
  source = json.loads(request.POST.get('source', '{}'))
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = destination['outputFormat']  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      _convert_format(source["format"], inverse=True)
      job_handle = _index(request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      client = SolrClient(request.user)

      unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
      df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None

      kwargs = {}

      stats = request.fs.stats(source["path"])
      if stats.size > MAX_UPLOAD_SIZE:
        raise PopupException(_('File size is too large to handle!'))

      indexer = MorphlineIndexer(request.user, request.fs)
      fields = indexer.get_kept_field_list(source['columns'])

      if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

      if not client.exists(index_name):
        client.create_index(name=index_name, fields=fields, unique_key_field=unique_key_field, df=df)

      data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
      client.index(name=index_name, data=data, **kwargs)

      job_handle = {
        'status': 0,
        'on_success_url': reverse('search:browse', kwargs={'name': index_name})
      }
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  return JsonResponse(job_handle)
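# For reference, a minimal sketch of the POST payload importer_submit() parses.
# The key names come from the view above; every value below is a made-up example.
import json

example_source = {"path": "/user/demo/data.csv", "format": {"type": "csv"}, "columns": []}
example_destination = {
  "outputFormat": "index",   # or 'database'; anything else falls through to table creation
  "name": "demo_index",
  "columns": [],
  "indexerRunJob": False,    # False -> index directly via SolrClient
  "indexerJobLibPath": None,
  "indexerPrimaryKey": [],   # empty -> a generated 'hue_id' field is used
  "indexerDefaultField": [],
}

post_data = {
  'source': json.dumps(example_source),
  'destination': json.dumps(example_destination),
  'start_time': json.dumps(-1),
}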
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    storage = dict([(delim['data_type'], delim['comment']) for delim in table_metadata.storage_details])
    if table_metadata.details['properties']['format'] == 'text':
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "csv",
        "hasHeader": False,
        "fieldSeparator": storage['serialization.format']
      }
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {
        "type": "parquet",
        "hasHeader": False,
      }
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {
      "quoteChar": "\"",
      "recordSeparator": "\\n",
      "type": "csv",
      "hasHeader": False,
      "fieldSeparator": "\u0001"
    }
  elif file_format['inputFormat'] == 'rdbms':
    format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format()

  format_['status'] = 0

  return JsonResponse(format_)
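# Shape of the JSON this view returns for a Hive text table, reconstructed from
# the branches above (the separator value depends on the table's SerDe properties):
#
#   {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv",
#    "hasHeader": false, "fieldSeparator": ",", "status": 0}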
def test_generate_csv_morphline(self):
  indexer = MorphlineIndexer("test", solr_client=self.solr_client)
  morphline = indexer.generate_morphline_config("test_collection", {
    "columns": deepcopy(self.simpleCSVFields),
    "format": self.simpleCSVFormat
  })

  assert_true(isinstance(morphline, basestring))
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
      "sample": sample['rows'][:4],
      "sample_cols": sample['meta'],
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample['meta']
      ]
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in table_metadata
      ]
    }

  return JsonResponse(format_)
def _test_fixed_type_format_generate_morphline(self, format_):
  indexer = MorphlineIndexer("test", solr_client=self.solr_client)

  format_instance = format_()
  morphline = indexer.generate_morphline_config("test_collection", {
    "columns": [field.to_dict() for field in format_instance.fields],
    "format": format_instance.get_format()
  })

  assert_true(isinstance(morphline, basestring))
def _create_index(user, fs, client, source, destination, index_name):
  unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
  df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
  kwargs = {}

  if source['inputFormat'] not in ('manual', 'table'):
    stats = fs.stats(source["path"])
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
      name=index_name,
      config_name=destination.get('indexerConfigSet'),
      fields=fields,
      unique_key_field=unique_key_field,
      df=df,
      shards=destination['indexerNumShards'],
      replication=destination['indexerReplicationFactor']
    )

  if source['inputFormat'] not in ('manual', 'table'):
    data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
    try:
      client.index(name=index_name, data=data, **kwargs)
    except Exception as e:
      try:
        client.delete_index(index_name, keep_config=False)
      except Exception as e2:
        LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
      raise e
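# Side note on the `value and value[0] or None` idiom used for unique_key_field
# and df above: it predates conditional expressions and yields None whenever the
# list is empty. A standalone illustration (names here are illustrative only):
keys = []
first_key = keys and keys[0] or None   # -> None for an empty list
first_key = keys[0] if keys else None  # modern equivalent; also safe when keys[0] is falsy (e.g. '')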
def _test_generate_field_operation_morphline(self, operation_format):
  fields = deepcopy(TestIndexer.simpleCSVFields)
  fields[0]['operations'].append(operation_format)

  indexer = MorphlineIndexer("test", solr_client=self.solr_client)
  morphline = indexer.generate_morphline_config("test_collection", {
    "columns": fields,
    "format": TestIndexer.simpleCSVFormat
  })

  assert_true(isinstance(morphline, basestring))
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
def _create_index(user, fs, client, source, destination, index_name):
  unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
  df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
  kwargs = {}

  if source['inputFormat'] != 'manual':
    stats = fs.stats(source["path"])
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
      name=index_name,
      fields=fields,
      unique_key_field=unique_key_field,
      df=df,
      shards=destination['indexerNumShards'],
      replication=destination['indexerReplicationFactor']
    )

  if source['inputFormat'] != 'manual':
    data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
    client.index(name=index_name, data=data, **kwargs)

  return {
    'status': 0,
    'on_success_url': reverse('indexer:indexes', kwargs={'index': index_name}),
    'pub_sub_url': 'assist.collections.refresh'
  }
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'):  # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
    request,
    collection_name,
    morphline,
    input_path,
    query,
    start_time=start_time,
    lib_path=lib_path
  )
def test_guess_format_invalid_csv_format(self):
  indexer = MorphlineIndexer("test", solr_client=self.solr_client)
  stream = StringIO.StringIO(TestIndexer.simpleCSVString)

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["fieldSeparator"] = "invalid separator"

  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])

  stream.seek(0)
  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["recordSeparator"] = "invalid separator"

  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])

  stream.seek(0)
  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["quoteChar"] = "invalid quoteChar"

  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    path = urllib.unquote(file_format["path"])
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(path):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(path)
    format_ = indexer.guess_format({"file": {"stream": stream, "name": path}})
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    try:
      table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    except Exception as e:
      raise PopupException(e.message if hasattr(e, 'message') and e.message else e)

    storage = {}
    for delim in table_metadata.storage_details:
      if delim['data_type']:
        if '=' in delim['data_type']:
          key, val = delim['data_type'].split('=', 1)
          storage[key] = val
        else:
          storage[delim['data_type']] = delim['comment']

    if table_metadata.details['properties']['format'] == 'text':
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "csv",
        "hasHeader": False,
        "fieldSeparator": storage.get('field.delim', ',')
      }
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {
        "type": "parquet",
        "hasHeader": False,
      }
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
def test_end_to_end(self):
  if not is_live_cluster():
    # Skipping as it requires the morphline libs to be set up
    raise SkipTest()

  cluster = shared_cluster()
  fs = cluster.fs
  make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
  user = User.objects.get(username="******")
  collection_name = "test_collection"
  indexer = MorphlineIndexer("test", fs=fs, jt=cluster.jt, solr_client=self.solr_client)
  input_loc = "/tmp/test.csv"

  # upload the test file to HDFS
  fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

  # open a filestream for the file on HDFS
  stream = fs.open(input_loc)

  # guess the format of the file
  file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

  field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

  format_ = field_types.copy()
  format_['format'] = file_type_format

  # find a field name available to use for the record's uuid
  unique_field = indexer.get_unique_field(format_)
  is_unique_generated = indexer.is_unique_generated(format_)

  # generate morphline
  morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

  schema_fields = indexer.get_kept_field_list(format_['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  # create the collection from the specified fields
  collection_manager = CollectionManagerController("test")
  if collection_manager.collection_exists(collection_name):
    collection_manager.delete_collection(collection_name, None)
  collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # index the file
  indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()

      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)  # Assumes handle still live

      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass  # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
    )

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif file_format['inputFormat'] == 'hs2_handle':
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib.unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def _create_solr_collection(user, fs, client, destination, index_name, kwargs):
  unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
  df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  for field in fields:
    for operation in field['operations']:
      if operation['type'] == 'split':
        field['multiValued'] = True  # Solr requires multiValued to be set when splitting
        kwargs['f.%(name)s.split' % field] = 'true'
        kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','

  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
      name=index_name,
      config_name=destination.get('indexerConfigSet'),
      fields=fields,
      unique_key_field=unique_key_field,
      df=df,
      shards=destination['indexerNumShards'],
      replication=destination['indexerReplicationFactor']
    )
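# Illustration (not part of the source): given one kept field carrying a 'split'
# operation, the loop above adds Solr CSV-handler parameters shaped like this.
field = {'name': 'tags', 'keep': True, 'operations': [{'type': 'split', 'settings': {'splitChar': ';'}}]}
kwargs = {}
for operation in field['operations']:
  if operation['type'] == 'split':
    field['multiValued'] = True
    kwargs['f.%(name)s.split' % field] = 'true'
    kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','
print(kwargs)  # {'f.tags.split': 'true', 'f.tags.separator': ';'}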
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = check_encoding(stream.read(10000))
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warning('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      data = get_topic_data(request.user, file_format.get('kafkaSelectedTopics'))

      kafkaFieldNames = [col['name'] for col in data['full_headers']]
      kafkaFieldTypes = [col['type'] for col in data['full_headers']]
      topics_data = data['rows']

      format_ = {
        "sample": topics_data,
        "columns": [
          Field(col, 'string', unique=False).to_dict()
          for col in kafkaFieldNames
        ]
      }

      # data = """%(kafkaFieldNames)s
      # %(data)s""" % {
      #   'kafkaFieldNames': ','.join(kafkaFieldNames),
      #   'data': '\n'.join([','.join(cols) for cols in topics_data])
      # }
      # stream = string_io()
      # stream.write(data)
      #
      # _convert_format(file_format["format"], inverse=True)
      #
      # indexer = MorphlineIndexer(request.user, request.fs)
      # format_ = indexer.guess_field_types({
      #   "file": {
      #     "stream": stream,
      #     "name": file_format['path']
      #   },
      #   "format": file_format['format']
      # })
      # type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))
      #
      # for col in format_['columns']:
      #   col['keyType'] = type_mapping[col['name']]
      #   col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
        'name': column['name'],
        'type': column['type']
      } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']]
      query = 'SELECT %s FROM %s LIMIT 4' % (', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      print(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    path = urllib_unquote(file_format["path"])
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(path):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(path)
    format_ = indexer.guess_format({"file": {"stream": stream, "name": path}})
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    try:
      table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    except Exception as e:
      raise PopupException(e.message if hasattr(e, 'message') and e.message else e)

    storage = {}
    for delim in table_metadata.storage_details:
      if delim['data_type']:
        if '=' in delim['data_type']:
          key, val = delim['data_type'].split('=', 1)
          storage[key] = val
        else:
          storage[delim['data_type']] = delim['comment']

    if table_metadata.details['properties']['format'] == 'text':
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "csv",
        "hasHeader": False,
        "fieldSeparator": storage.get('field.delim', ',')
      }
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {
        "type": "parquet",
        "hasHeader": False,
      }
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {
      "quoteChar": "\"",
      "recordSeparator": "\\n",
      "type": "csv",
      "hasHeader": False,
      "fieldSeparator": "\u0001"
    }
  elif file_format['inputFormat'] == 'rdbms':
    format_ = {"type": "csv"}
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      format_ = {
        "type": "json",
        # "fieldSeparator": ",",
        # "hasHeader": True,
        # "quoteChar": "\"",
        # "recordSeparator": "\\n",
        'topics': get_topics(request.user)
      }
    elif file_format['streamSelection'] == 'flume':
      format_ = {
        "type": "csv",
        "fieldSeparator": ",",
        "hasHeader": True,
        "quoteChar": "\"",
        "recordSeparator": "\\n"
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      format_ = {
        "type": "csv",
        "fieldSeparator": ",",
        "hasHeader": True,
        "quoteChar": "\"",
        "recordSeparator": "\\n",
        'objects': [sobject['name'] for sobject in sf.restful('sobjects/')['sobjects'] if sobject['queryable']]
      }
    else:
      raise PopupException(_('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  format_['status'] = 0

  return JsonResponse(format_)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib.unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
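# For context on the encoding sniffing above: chardet.detect() takes raw bytes
# and returns a dict like {'encoding': 'utf-8', 'confidence': 0.99, ...}, which
# is why the code reads a 10000-byte sample and then seeks back to the start.
import chardet
print(chardet.detect(u'héllo,wörld\n'.encode('utf-8')).get('encoding'))  # e.g. 'utf-8'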
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    # Note: mocked here, should come from SFDC or Kafka API or sampling job
    if file_format['streamSelection'] == 'kafka':
      data = """%(kafkaFieldNames)s
%(data)s""" % {
        'kafkaFieldNames': file_format.get('kafkaFieldNames', ''),
        'data': '\n'.join([','.join(['...'] * len(file_format.get('kafkaFieldTypes', '').split(',')))] * 5)
      }
      stream = StringIO.StringIO()
      stream.write(data)

      _convert_format(file_format["format"], inverse=True)

      indexer = MorphlineIndexer(request.user, request.fs)
      format_ = indexer.guess_field_types({
        "file": {
          "stream": stream,
          "name": file_format['path']
        },
        "format": file_format['format']
      })

      type_mapping = dict(zip(file_format['kafkaFieldNames'].split(','), file_format['kafkaFieldTypes'].split(',')))
      for col in format_['columns']:
        col['keyType'] = type_mapping[col['name']]
        col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = None  # Todo optional input field
  input_path = None

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = file_format["path"]
    properties = {
      'input_path': input_path,
      'format': 'csv'
    }
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    pass
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        "brokers": manager.get_kafka_brokers(),
        "topics": file_format['kafkaSelectedTopics'],
        "kafkaFieldType": file_format['kafkaFieldType'],
        "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
      }

      if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
        schema_fields = MorphlineIndexer.get_kept_field_list(file_format['sampleCols'])
        properties.update({
          "kafkaFieldNames": ', '.join([_field['name'] for _field in schema_fields]),
          "kafkaFieldTypes": ', '.join([_field['type'] for _field in schema_fields])
        })
      else:
        properties.update({
          "kafkaFieldNames": file_format['kafkaFieldNames'],
          "kafkaFieldTypes": file_format['kafkaFieldTypes']
        })

      if True:
        properties['window'] = ''
      else:  # For "KafkaSQL"
        properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''
  elif file_format['inputFormat'] == 'connector':
    if file_format['streamSelection'] == 'flume':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'channelSourceHosts': file_format['channelSourceHosts'],
        'channelSourceSelectedHosts': file_format['channelSourceSelectedHosts'],
        'channelSourcePath': file_format['channelSourcePath'],
      }
    else:  # sfdc
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }

  if destination['outputFormat'] == 'table':
    if destination['isTargetExisting']:
      # Todo: check if format matches
      pass
    else:
      destination['importData'] = False  # Avoid LOAD DATA
      if destination['tableFormat'] == 'kudu':
        properties['kafkaFieldNames'] = properties['kafkaFieldNames'].lower()  # Kudu names should be all lowercase

      # Create table
      if not request.POST.get('show_command'):
        SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).execute(request)

    if destination['tableFormat'] == 'kudu':
      manager = ManagerApi()
      properties["output_table"] = "impala::%s" % collection_name
      properties["kudu_master"] = manager.get_kudu_master()
    else:
      properties['output_table'] = collection_name
  elif destination['outputFormat'] == 'stream':
    manager = ManagerApi()
    properties['brokers'] = manager.get_kafka_brokers()
    properties['topics'] = file_format['kafkaSelectedTopics']
    properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']
  elif destination['outputFormat'] == 'file':
    properties['path'] = file_format["path"]
    if file_format['inputFormat'] == 'stream':
      properties['format'] = 'csv'
    else:
      properties['format'] = file_format['tableFormat']  # or csv
  elif destination['outputFormat'] == 'index':
    properties['collectionName'] = collection_name
    properties['connection'] = SOLR_URL.get()

  properties["app_name"] = 'Data Ingest'
  properties["inputFormat"] = file_format['inputFormat']
  properties["ouputFormat"] = destination['ouputFormat']
  properties["streamSelection"] = file_format["streamSelection"]

  configs = indexer.generate_config(properties)

  if request.POST.get('show_command'):
    return {'status': 0, 'commands': configs['envelope.conf']}
  else:
    return indexer.run(request, collection_name, configs, input_path, start_time=start_time, lib_path=lib_path)
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib_unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": path
      },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_ and format_['sample']:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']
    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in columns['extended_columns']
      ]

    format_ = {
      "sample": sample,
      "columns": columns,
    }
  elif file_format['inputFormat'] == 'rdbms':
    api = _get_api(request)
    sample = api.get_sample_data(None, database=file_format['rdbmsDatabaseName'], table=file_format['tableName'])

    format_ = {
      "sample": list(sample['rows'])[:4],
      "columns": [
        Field(col['name'], col['type']).to_dict()
        for col in sample['full_headers']
      ]
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      if file_format.get('kafkaSelectedTopics') == 'NavigatorAuditEvents':
        kafkaFieldNames = [
          'id', 'additionalInfo', 'allowed', 'collectionName', 'databaseName', 'db',
          'DELEGATION_TOKEN_ID', 'dst', 'entityId', 'family', 'impersonator', 'ip',
          'name', 'objectType', 'objType', 'objUsageType', 'operationParams',
          'operationText', 'op', 'opText', 'path', 'perms', 'privilege', 'qualifier',
          'QUERY_ID', 'resourcePath', 'service', 'SESSION_ID', 'solrVersion', 'src',
          'status', 'subOperation', 'tableName', 'table', 'time', 'type', 'url', 'user'
        ]
        kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
        kafkaFieldNames.append('timeDate')
        kafkaFieldTypes.append('date')
      else:
        # Note: mocked here, should come from SFDC or Kafka API or sampling job
        kafkaFieldNames = file_format.get('kafkaFieldNames', '').split(',')
        kafkaFieldTypes = file_format.get('kafkaFieldTypes', '').split(',')

      data = """%(kafkaFieldNames)s
%(data)s""" % {
        'kafkaFieldNames': ','.join(kafkaFieldNames),
        'data': '\n'.join([','.join(['...'] * len(kafkaFieldTypes))] * 5)
      }
      stream = string_io()
      stream.write(data)

      _convert_format(file_format["format"], inverse=True)

      indexer = MorphlineIndexer(request.user, request.fs)
      format_ = indexer.guess_field_types({
        "file": {
          "stream": stream,
          "name": file_format['path']
        },
        "format": file_format['format']
      })

      type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))
      for col in format_['columns']:
        col['keyType'] = type_mapping[col['name']]
        col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'flume':
      if 'hue-httpd/access_log' in file_format['channelSourcePath']:
        columns = [
          {'name': 'id', 'type': 'string', 'unique': True},
          {'name': 'client_ip', 'type': 'string'},
          {'name': 'time', 'type': 'date'},
          {'name': 'request', 'type': 'string'},
          {'name': 'code', 'type': 'plong'},
          {'name': 'bytes', 'type': 'plong'},
          {'name': 'method', 'type': 'string'},
          {'name': 'url', 'type': 'string'},
          {'name': 'protocol', 'type': 'string'},
          {'name': 'app', 'type': 'string'},
          {'name': 'subapp', 'type': 'string'}
        ]
      else:
        columns = [{'name': 'message', 'type': 'string'}]

      format_ = {
        "sample": [['...'] * len(columns)] * 4,
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string'), unique=col.get('unique')).to_dict()
          for col in columns
        ]
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      table_metadata = [{
        'name': column['name'],
        'type': column['type']
      } for column in sf.restful('sobjects/%(streamObject)s/describe/' % file_format)['fields']]
      query = 'SELECT %s FROM %s LIMIT 4' % (', '.join([col['name'] for col in table_metadata]), file_format['streamObject'])
      print(query)

      try:
        records = sf.query_all(query)
      except SalesforceRefusedRequest as e:
        raise PopupException(message=str(e))

      format_ = {
        "sample": [list(row.values())[1:] for row in records['records']],
        "columns": [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in table_metadata
        ]
      }
    else:
      raise PopupException(_('Connector format not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  return JsonResponse(format_)
def _small_indexing(user, fs, client, source, destination, index_name):
  unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
  df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    stats = fs.stats(source["path"])
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
      name=index_name,
      config_name=destination.get('indexerConfigSet'),
      fields=fields,
      unique_key_field=unique_key_field,
      df=df,
      shards=destination['indexerNumShards'],
      replication=destination['indexerReplicationFactor']
    )

  if source['inputFormat'] == 'file':
    data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()

      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)  # Assumes handle still live

      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  file_type = file_format['file_type']
  path = urllib_unquote(file_format["path"])

  if sys.version_info[0] < 3 and (file_type == 'excel' or path[-3:] == 'xls' or path[-4:] == 'xlsx'):
    return JsonResponse({
      'status': -1,
      'message': 'Python2 based Hue does not support Excel file importer'
    })

  if file_format['inputFormat'] == 'localfile':
    if file_type == 'excel':
      format_ = {"type": "excel", "hasHeader": True}
    else:
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "csv",
        "hasHeader": True,
        "fieldSeparator": ","
      }
  elif file_format['inputFormat'] == 'file':
    if path[-3:] == 'xls' or path[-4:] == 'xlsx':
      file_obj = request.fs.open(path)
      if path[-3:] == 'xls':
        df = pd.read_excel(file_obj.read(1024 * 1024 * 1024), engine='xlrd')
      else:
        df = pd.read_excel(file_obj.read(1024 * 1024 * 1024), engine='openpyxl')
      _csv_data = df.to_csv(index=False)

      path = excel_to_csv_file_name_change(path)
      request.fs.create(path, overwrite=True, data=_csv_data)

    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(path):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(path)
    format_ = indexer.guess_format({"file": {"stream": stream, "name": path}})
    _convert_format(format_)

    if file_format["path"][-3:] == 'xls' or file_format["path"][-4:] == 'xlsx':
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "excel",
        "hasHeader": True,
        "fieldSeparator": ","
      }
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    try:
      table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    except Exception as e:
      raise PopupException(e.message if hasattr(e, 'message') and e.message else e)

    storage = {}
    for delim in table_metadata.storage_details:
      if delim['data_type']:
        if '=' in delim['data_type']:
          key, val = delim['data_type'].split('=', 1)
          storage[key] = val
        else:
          storage[delim['data_type']] = delim['comment']

    if table_metadata.details['properties']['format'] == 'text':
      format_ = {
        "quoteChar": "\"",
        "recordSeparator": '\\n',
        "type": "csv",
        "hasHeader": False,
        "fieldSeparator": storage.get('field.delim', ',')
      }
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {
        "type": "parquet",
        "hasHeader": False,
      }
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {
      "quoteChar": "\"",
      "recordSeparator": "\\n",
      "type": "csv",
      "hasHeader": False,
      "fieldSeparator": "\u0001"
    }
  elif file_format['inputFormat'] == 'rdbms':
    format_ = {"type": "csv"}
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'kafka':
      format_ = {
        "type": "json",
        # "fieldSeparator": ",",
        # "hasHeader": True,
        # "quoteChar": "\"",
        # "recordSeparator": "\\n",
        'topics': get_topics(request.user)
      }
    elif file_format['streamSelection'] == 'flume':
      format_ = {
        "type": "csv",
        "fieldSeparator": ",",
        "hasHeader": True,
        "quoteChar": "\"",
        "recordSeparator": "\\n"
      }
  elif file_format['inputFormat'] == 'connector':
    if file_format['connectorSelection'] == 'sfdc':
      sf = Salesforce(
        username=file_format['streamUsername'],
        password=file_format['streamPassword'],
        security_token=file_format['streamToken']
      )
      format_ = {
        "type": "csv",
        "fieldSeparator": ",",
        "hasHeader": True,
        "quoteChar": "\"",
        "recordSeparator": "\\n",
        'objects': [sobject['name'] for sobject in sf.restful('sobjects/')['sobjects'] if sobject['queryable']]
      }
    else:
      raise PopupException(_('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s') % file_format)
  else:
    raise PopupException(_('Input format not recognized: %(inputFormat)s') % file_format)

  format_['status'] = 0

  return JsonResponse(format_)