def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    storage = dict([(delim['data_type'], delim['comment']) for delim in table_metadata.storage_details])

    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"}

  format_['status'] = 0
  return JsonResponse(format_)

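# A hedged sketch, not part of the original source: example 'fileFormat' POST
# payloads accepted by guess_format() above, with keys inferred from what each
# branch of the function actually reads. The path, database, and table names
# are made up for illustration.
EXAMPLE_FILE_FORMAT_PAYLOADS = [
  {"inputFormat": "file", "path": "/user/test/data.csv"},
  {"inputFormat": "table", "databaseName": "default", "tableName": "sample_07"},
  {"inputFormat": "query"},
]
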
def test_guess_csv_format(self):
  stream = StringIO.StringIO(IndexerTest.simpleCSVString)
  indexer = Indexer("test", None)

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']

  # test format
  expected_format = self.simpleCSVFormat
  assert_equal(expected_format, guessed_format)

  # test fields
  expected_fields = self.simpleCSVFields
  for expected, actual in zip(expected_fields, fields):
    for key in ("name", "type"):
      assert_equal(expected[key], actual[key])

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':
    # TODO: get schema from explain query
    # NOTE: format_ is never assigned in this branch, so a 'query' input
    # would raise a NameError on the return below.
    pass

  return JsonResponse(format_)

def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    storage = dict([(delim['data_type'], delim['comment']) for delim in table_metadata.storage_details])

    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    # TODO: the separator should be CTRL+A ("\u0001") rather than "\t"
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\t"}

  return JsonResponse(format_)

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':  # Only support open query history
    # TODO: get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample.meta
      ]
    }

  return JsonResponse(format_)

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  indexer = Indexer(request.user, request.fs)
  stream = request.fs.open(file_format["path"])
  _convert_format(file_format["format"], inverse=True)

  format_ = indexer.guess_field_types({"file": stream, "format": file_format['format']})

  return JsonResponse(format_)

def test_generate_csv_morphline(self): indexer = Indexer("test") morphline =indexer.generate_morphline_config("test_collection", { "columns": self.simpleCSVFields, "format": self.simpleCSVFormat }) assert_true(isinstance(morphline, basestring))
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  indexer = Indexer(request.user, request.fs)
  stream = request.fs.open(file_format["path"])
  format_ = indexer.guess_format({"file": stream})
  _convert_format(format_)

  return JsonResponse(format_)

def test_generate_csv_morphline(self): indexer = Indexer("test", None) morphline = indexer.generate_morphline_config( "test_collection", { "columns": self.simpleCSVFields, "format": self.simpleCSVFormat }) assert_true(isinstance(morphline, basestring))
def test_end_to_end(self):
  if not is_live_cluster():
    raise SkipTest()

  cluster = shared_cluster()
  fs = cluster.fs
  collection_name = "test_collection"
  indexer = Indexer("test", fs=fs, jt=cluster.jt)
  input_loc = "/tmp/test.csv"

  # upload the test file to hdfs
  fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

  # open a filestream for the file on hdfs
  stream = fs.open(input_loc)

  # guess the format of the file
  file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

  field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

  format_ = field_types.copy()
  format_['format'] = file_type_format

  # find a field name available to use for the record's uuid
  unique_field = indexer.get_unique_field(format_)
  is_unique_generated = indexer.is_unique_generated(format_)

  # generate morphline
  morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

  schema_fields = indexer.get_kept_field_list(format_['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  # create the collection from the specified fields
  collection_manager = CollectionManagerController("test")
  if collection_manager.collection_exists(collection_name):
    collection_manager.delete_collection(collection_name, None)
  collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # index the file
  indexer.run_morphline(collection_name, morphline, input_loc)

def _test_fixed_type_format_generate_morphline(format_): indexer = Indexer("test") format_instance = format_() morphline = indexer.generate_morphline_config("test_collection", { "columns": [field.to_dict() for field in format_instance.fields], "format": format_instance.get_format() }) assert_true(isinstance(morphline, basestring))
def _test_generate_field_operation_morphline(operation_format):
  fields = TestIndexer.simpleCSVFields[:]
  fields[0]['operations'].append(operation_format)

  indexer = Indexer("test")
  morphline = indexer.generate_morphline_config("test_collection", {
    "columns": fields,
    "format": TestIndexer.simpleCSVFormat
  })

  assert_true(isinstance(morphline, basestring))

def _test_fixed_type_format_generate_morphline(format_): indexer = Indexer("test", None) format_instance = format_() morphline = indexer.generate_morphline_config( "test_collection", { "columns": [field.to_dict() for field in format_instance.fields], "format": format_instance.get_format() }) assert_true(isinstance(morphline, basestring))
def _test_generate_field_operation_morphline(operation_format):
  fields = IndexerTest.simpleCSVFields[:]
  fields[0]['operations'].append(operation_format)

  indexer = Indexer("test", None)
  morphline = indexer.generate_morphline_config("test_collection", {
    "columns": fields,
    "format": IndexerTest.simpleCSVFormat
  })

  assert_true(isinstance(morphline, basestring))

def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  indexer = Indexer(request.user, request.fs)
  stream = request.fs.open(file_format["path"])
  format_ = indexer.guess_format({
    "file": {
      "stream": stream,
      "name": file_format['path']
    }
  })
  _convert_format(format_)

  return JsonResponse(format_)

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  indexer = Indexer(request.user, request.fs)
  stream = request.fs.open(file_format["path"])
  _convert_format(file_format["format"], inverse=True)

  format_ = indexer.guess_field_types({
    "file": {
      "stream": stream,
      "name": file_format['path']
    },
    "format": file_format['format']
  })

  return JsonResponse(format_)

def test_guess_format_invalid_csv_format(self):
  indexer = Indexer("test", None)
  stream = StringIO.StringIO(IndexerTest.simpleCSVString)

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["fieldSeparator"] = "invalid separator"
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])

  stream.seek(0)
  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["recordSeparator"] = "invalid separator"
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])

  stream.seek(0)
  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  guessed_format["quoteChar"] = "invalid quoteChar"
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
  assert_equal(fields, [])

def index_file(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])

  return JsonResponse({"jobId": job_id})

def _index(request, file_format, collection_name, query=None):
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif file_format['inputFormat'] == 'hs2_handle':
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)

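# Hypothetical illustration, not from the original source: how _index() above
# resolves its indexing input for each inputFormat (example values invented).
#   {'inputFormat': 'table', 'databaseName': 'default', 'tableName': 'web_logs'}
#     -> input_path = table_metadata.path_location (the table's HDFS location)
#   {'inputFormat': 'file', 'path': '/user/test/data.csv'}
#     -> input_path = '${nameNode}/user/test/data.csv'
#   {'inputFormat': 'hs2_handle', 'fetch_handle': ..., 'columns': [...]}
#     -> returns update_data_from_hive() directly and never runs the morphline
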
def test_guess_csv_format(self):
  stream = StringIO.StringIO(TestIndexer.simpleCSVString)
  indexer = Indexer("test")

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']

  # test format
  expected_format = self.simpleCSVFormat
  assert_equal(expected_format, guessed_format)

  # test fields
  expected_fields = self.simpleCSVFields
  for expected, actual in zip(expected_fields, fields):
    for key in ("name", "type"):
      assert_equal(expected[key], actual[key])

def index_file(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_uuid_name(file_format)
  schema_fields = [{"name": unique_field, "type": "string"}] + indexer.get_kept_field_list(file_format['columns'])

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])

  return JsonResponse({"jobId": job_id})

def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = Indexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
        "stream": stream,
        "name": file_format['path']
      },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
      "sample": sample['rows'][:4],
      "columns": [
        Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
        for col in table_metadata.cols
      ]
    }
  elif file_format['inputFormat'] == 'query':  # Only support open query history
    # TODO: get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
      "sample": sample['rows'][:4],
      "sample_cols": sample.meta,
      "columns": [
        Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
        for col in sample.meta
      ]
    }

  return JsonResponse(format_)

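# Hedged sketch of the 'query' payload read by guess_field_types() above: the
# branch looks up a saved query by its Document2 id and re-runs the query's
# first snippet to fetch a 4-row sample. The id below is hypothetical.
#   {"inputFormat": "query", "query": 123}
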
def test_end_to_end(self):
  if not is_live_cluster() or True:  # Skipping as it requires morphlines libs to be set up
    raise SkipTest()

  cluster = shared_cluster()
  fs = cluster.fs
  make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
  user = User.objects.get(username="******")
  collection_name = "test_collection"
  indexer = Indexer("test", fs=fs, jt=cluster.jt)
  input_loc = "/tmp/test.csv"

  # upload the test file to hdfs
  fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

  # open a filestream for the file on hdfs
  stream = fs.open(input_loc)

  # guess the format of the file
  file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

  field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

  format_ = field_types.copy()
  format_['format'] = file_type_format

  # find a field name available to use for the record's uuid
  unique_field = indexer.get_unique_field(format_)
  is_unique_generated = indexer.is_unique_generated(format_)

  # generate morphline
  morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

  schema_fields = indexer.get_kept_field_list(format_['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  # create the collection from the specified fields
  collection_manager = CollectionManagerController("test")
  if collection_manager.collection_exists(collection_name):
    collection_manager.delete_collection(collection_name, None)
  collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # index the file
  indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)

def _index(request, file_format, collection_name, query=None):
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  return indexer.run_morphline(request, collection_name, morphline, input_path, query)

def index_file(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))
  _convert_format(file_format["format"], inverse=True)
  collection_name = file_format["name"]
  indexer = Indexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

  collection_manager = CollectionManagerController(request.user)
  if not collection_manager.collection_exists(collection_name):
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  else:
    input_path = file_format["path"]

  job_handle = indexer.run_morphline(request, collection_name, morphline, input_path)  # TODO if query generate insert

  return JsonResponse(job_handle)

def test_guess_format(self):
  stream = StringIO.StringIO(IndexerTest.simpleCSVString)
  indexer = Indexer("test", None)

  guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
  fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']

  # test format
  assert_equal('csv', guessed_format['type'])
  assert_equal(',', guessed_format['fieldSeparator'])
  assert_equal('\n', guessed_format['recordSeparator'])

  # test fields
  expected_fields = [
    {"name": "id", "type": "long"},
    {"name": "Rating", "type": "long"},
    {"name": "Location", "type": "string"},
    {"name": "Name", "type": "string"},
    {"name": "Time", "type": "string"}
  ]

  for expected, actual in zip(expected_fields, fields):
    for key in ("name", "type"):
      assert_equal(expected[key], actual[key])

def test_end_to_end(self):
  fs = cluster.get_hdfs()
  collection_name = "test_collection"
  indexer = Indexer("test", fs)
  input_loc = "/tmp/test.csv"

  # upload the test file to hdfs
  fs.create(input_loc, data=IndexerTest.simpleCSVString, overwrite=True)

  # open a filestream for the file on hdfs
  stream = fs.open(input_loc)

  # guess the format of the file
  file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

  field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

  format_ = field_types.copy()
  format_['format'] = file_type_format

  # find a field name available to use for the record's uuid
  unique_field = indexer.get_uuid_name(format_)

  # generate morphline
  morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

  schema_fields = [{"name": unique_field, "type": "string"}] + indexer.get_kept_field_list(format_['columns'])

  # create the collection from the specified fields
  collection_manager = CollectionManagerController("test")
  if collection_manager.collection_exists(collection_name):
    collection_manager.delete_collection(collection_name, None)
  collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

  # index the file
  indexer.run_morphline(collection_name, morphline, input_loc)