def collections_data(request, collection):
    """Import data from an HDFS file into an existing Solr collection (POST only).

    Raises PopupException on non-POST requests; returns a JsonResponse with
    'status' (0 on success) and a 'message'.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    result = {'status': -1}
    source = request.POST.get('source')

    if source != 'file':
        result['message'] = _('Unsupported source %s') % source
    else:
        controller = CollectionManagerController(request.user)
        controller.update_data_from_hdfs(
            request.fs,
            collection,
            None,
            request.POST.get('path'),
            request.POST.get('type'),
            separator=request.POST.get('separator'),
            quote_character=request.POST.get('quote')
        )
        result['status'] = 0
        result['message'] = _('Index imported!')

    return JsonResponse(result)
def _index(request, file_format, collection_name, query=None):
    """Create the Solr collection for `file_format` (if missing) and run the morphline job.

    Returns whatever `Indexer.run_morphline` returns (a job handle).
    """
    indexer = Indexer(request.user, request.fs)

    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        # The unique key does not exist in the data; add a generated string field for it.
        schema_fields.append({"name": unique_field, "type": "string"})

    morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    input_format = file_format['inputFormat']
    if input_format == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif input_format == 'file':
        input_path = '${nameNode}%s' % file_format["path"]
    else:
        input_path = None

    return indexer.run_morphline(request, collection_name, morphline, input_path, query)
def collections_import(request):
    """Register an existing Solr collection in Hue and save its metadata (POST only).

    Expects a JSON 'collection' object in the POST body; returns a JsonResponse
    with 'status' (0 on success) and a 'message'.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    result = {'status': -1}
    collection = json.loads(request.POST.get('collection', '{}'))

    if not collection:
        result['message'] = _('Collection missing.')
    else:
        searcher = CollectionManagerController(request.user)
        unique_key, fields = searcher.get_fields(collection.get('name'))

        # Create collection and metadata.
        hue_collection, created = Collection.objects.get_or_create(
            name=collection.get('name'),
            solr_properties='{}',
            is_enabled=True,
            user=request.user
        )

        properties_dict = hue_collection.properties_dict
        properties_dict['data_type'] = 'separated'
        properties_dict['field_order'] = list(fields)
        hue_collection.properties = json.dumps(properties_dict)
        hue_collection.save()

        result['status'] = 0
        result['message'] = _('Collection created!')

    return JsonResponse(result)
def index_file(request):
    """Create a collection for the posted file/table spec and start the morphline indexing job.

    Reads a JSON 'fileFormat' object from the POST body and returns the job
    handle as a JsonResponse.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    _convert_format(file_format["format"], inverse=True)
    collection_name = file_format["name"]

    indexer = Indexer(request.user, request.fs)

    unique_field = indexer.get_unique_field(file_format)
    is_unique_generated = indexer.is_unique_generated(file_format)

    schema_fields = indexer.get_kept_field_list(file_format['columns'])
    if is_unique_generated:
        # Unique key is synthesized, so it needs its own schema field.
        schema_fields.append({"name": unique_field, "type": "string"})

    morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    else:
        input_path = file_format["path"]

    # TODO if query generate insert
    job_handle = indexer.run_morphline(request, collection_name, morphline, input_path)
    return JsonResponse(job_handle)
def autocomplete(request):
    """Return the user's Solr collections and configs, flagged for autocompletion."""
    searcher = CollectionManagerController(request.user)
    suggestions = searcher.get_autocomplete()

    entries = [
        {'name': collection, 'isCollection': True, 'isConfig': False}
        for collection in suggestions['collections']
    ]
    entries.extend(
        {'name': config, 'isCollection': False, 'isConfig': True}
        for config in suggestions['configs']
    )

    return JsonResponse({'status': 0, 'collections': entries})
def test_collections_fields(self):
    """Fetch fields for the example collection and check the install_examples payload."""
    controller = CollectionManagerController(self.user)
    controller.get_fields('log_analytics')

    resp = self.client.post(reverse('indexer:install_examples'))
    payload = json.loads(resp.content)

    assert_equal(payload.get('status'), 0)
    assert_equal(payload.get('fields'), 0)
    assert_equal(payload.get('unique_key'), 0)
def test_create_collection(self):
    """Create a collection with a single text field, always cleaning it up afterwards."""
    controller = CollectionManagerController(self.user)
    collection_name = get_db_prefix(name='solr') + 'test_create_collection'
    field_defs = [{'name': 'my_test', 'type': 'text'}]

    try:
        controller.create_collection(collection_name, field_defs, unique_key_field='id', df='text')
    finally:
        controller.delete_collection(collection_name, core=False)
def collections_fields(request, collection):
    """Return a Solr collection's field list and unique key (GET only).

    Each field is reported as (name, type, indexed, stored).
    """
    if request.method != 'GET':
        raise PopupException(_('GET request required.'))

    searcher = CollectionManagerController(request.user)
    unique_key, fields = searcher.get_fields(collection)

    field_rows = []
    for name in fields:
        meta = fields[name]
        field_rows.append((name, meta['type'], meta.get('indexed', None), meta.get('stored', None)))

    return JsonResponse({
        'status': 0,
        'fields': field_rows,
        'unique_key': unique_key,
    })
def index_file(request):
    """Create a collection for the posted file format and launch the morphline job.

    Returns a JsonResponse holding the submitted job's id.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    _convert_format(file_format["format"], inverse=True)
    collection_name = file_format["name"]

    indexer = Indexer(request.user, request.fs)
    unique_field = indexer.get_uuid_name(file_format)

    # Schema is the generated uuid field followed by the kept data fields.
    schema_fields = [{"name": unique_field, "type": "string"}]
    schema_fields.extend(indexer.get_kept_field_list(file_format['columns']))

    morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field)

    collection_manager = CollectionManagerController(request.user)
    if not collection_manager.collection_exists(collection_name):
        collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    job_id = indexer.run_morphline(collection_name, morphline, file_format["path"])
    return JsonResponse({"jobId": job_id})
def collections(request):
    """List the Solr collections, cores and aliases visible to the user."""
    searcher = CollectionManagerController(request.user)
    solr_collections = searcher.get_collections()

    entries = [
        {
            'name': name,
            'isCoreOnly': info['isCoreOnly'],
            'isAlias': info.get('isAlias', False),
            'collections': info.get('collections', []),
        }
        for name, info in solr_collections.items()
    ]

    return JsonResponse({'status': 0, 'collections': entries})
def collections_update(request, collection):
    """Update an existing collection's fields (POST only).

    Expects a JSON 'collection' object in the POST body; returns a JsonResponse
    with 'status' (0 on success) and a 'message'.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}
    collection = json.loads(request.POST.get('collection', '{}'))

    if not collection:
        response['message'] = _('No collection to update.')
    else:
        searcher = CollectionManagerController(request.user)
        searcher.update_collection(collection.get('name'), collection.get('fields', []))
        response['status'] = 0
        response['message'] = _('Collection updated!')

    return JsonResponse(response)
def collections_remove(request):
    """Delete the given collections and their instance directories (POST only).

    Expects a JSON 'collections' list in the POST body; silently skips names
    that no longer exist in Solr.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}
    collections = json.loads(request.POST.get('collections', '[]'))

    if not collections:
        response['message'] = _('No collections to remove.')
    else:
        searcher = CollectionManagerController(request.user)
        existing = searcher.get_collections()

        for collection in collections:
            name = collection.get('name')
            if name in existing:
                # Remove collection and instancedir
                searcher.delete_collection(name, collection.get('isCoreOnly'))

        response['status'] = 0
        response['message'] = _('Collections removed!')

    return JsonResponse(response)
def collections_create(request):
    """Create a new Solr collection and optionally index initial data into it.

    POST parameters:
      collection -- JSON object with 'name', 'fields', 'uniqueKeyField', 'df'.
      source     -- optional; 'file' indexes an HDFS file (with 'path', 'type',
                    'separator', 'quote'), 'hive' indexes a Hive table
                    ('database', 'table').

    Returns a JsonResponse with 'status' (0 on success) and a 'message'.
    Raises PopupException on non-POST requests; indexing errors are logged and
    re-raised.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    response = {'status': -1}

    collection = json.loads(request.POST.get('collection', '{}'))

    if collection:
        searcher = CollectionManagerController(request.user)

        # Create instance directory, collection, and add fields
        searcher.create_collection(collection.get('name'), collection.get('fields', []), collection.get('uniqueKeyField'), collection.get('df'))

        try:
            if request.POST.get('source') == 'file':
                # Index data
                searcher.update_data_from_hdfs(
                    request.fs,
                    collection.get('name'),
                    collection.get('fields', []),
                    request.POST.get('path'),
                    request.POST.get('type'),
                    separator=request.POST.get('separator'),
                    quote_character=request.POST.get('quote')
                )
            elif request.POST.get('source') == 'hive':
                # Run a custom hive query and post data to collection
                from beeswax.server import dbms

                db = dbms.get(request.user)
                database = request.POST.get('database')
                table = request.POST.get('table')
                columns = [field['name'] for field in collection.get('fields', [])]

                searcher.update_data_from_hive(db, collection.get('name'), database, table, columns)

            response['status'] = 0
            response['message'] = _('Collection created!')
        except Exception as e:  # was "except Exception, e" (Python-2-only syntax)
            LOG.error(e)
            raise

    # NOTE(review): the original fell off the end without returning anything;
    # every sibling view returns the response dict, so do the same here.
    # When 'collection' is missing, status stays -1 (original set no message).
    return JsonResponse(response)
def test_end_to_end(self):
    """Upload a CSV to HDFS, guess its format, create a collection and index it."""
    # Skipping as requires morplines libs to be setup
    if not is_live_cluster() or True:
        raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")

    collection_name = "test_collection"
    indexer = Indexer("test", fs=fs, jt=cluster.jt)
    input_loc = "/tmp/test.csv"

    # Upload the test file to HDFS, then open a stream on it.
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)
    stream = fs.open(input_loc)

    # Guess the format of the file and its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Find a field name available to use for the record's uuid.
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # Generate morphline config and the collection schema.
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)
    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
        schema_fields.append({"name": unique_field, "type": "string"})

    # Recreate the collection from the specified fields.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
        collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Index the file.
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
def test_end_to_end(self):
    """Upload a CSV to HDFS, guess its format, create a collection and index it."""
    # NOTE(review): 'cluster' is not defined locally; presumably a module-level fixture.
    fs = cluster.get_hdfs()
    collection_name = "test_collection"
    indexer = Indexer("test", fs)
    input_loc = "/tmp/test.csv"

    # Upload the test file to HDFS, then open a stream on it.
    fs.create(input_loc, data=IndexerTest.simpleCSVString, overwrite=True)
    stream = fs.open(input_loc)

    # Guess the format of the file and its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Find a field name available to use for the record's uuid.
    unique_field = indexer.get_uuid_name(format_)

    # Generate morphline config; schema is uuid field plus the kept data fields.
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)
    schema_fields = [{"name": unique_field, "type": "string"}]
    schema_fields.extend(indexer.get_kept_field_list(format_['columns']))

    # Recreate the collection from the specified fields.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
        collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Index the file.
    indexer.run_morphline(collection_name, morphline, input_loc)
def _small_indexing(user, fs, client, source, destination, index_name):
    """Index a small amount of data into Solr directly, without a morphline job.

    Supports 'file', 'query' and 'manual' input formats; on any indexing
    failure the freshly created index is deleted before re-raising.
    Returns a status dict with a success URL and any per-record errors.
    """
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        # Reject files too big for direct upload.
        path = urllib_unquote(source["path"])
        stats = fs.stats(path)
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    _create_solr_collection(user, fs, client, destination, index_name, kwargs)

    if source['inputFormat'] == 'file':
        path = urllib_unquote(source["path"])
        data = fs.read(path, 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get('id') else source['query']
            notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [field['name'] for field in fields if field['name'] != 'hue_id']
            # Assumes handle still live
            fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
            rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        elif source['inputFormat'] == 'manual':
            pass  # No need to do anything
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
    except Exception as e:
        # Clean up the half-created index, then surface the original failure.
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
        raise e

    return {
        'status': 0,
        'on_success_url': reverse('indexer:indexes', kwargs={'index': index_name}),
        'pub_sub_url': 'assist.collections.refresh',
        'errors': errors,
    }
def test_is_solr_cloud_mode(self):
    """Solr must be running in cloud mode for these tests."""
    controller = CollectionManagerController(self.user)
    assert_true(controller.is_solr_cloud_mode())
def test_collection_exists(self):
    """A made-up collection name must not be reported as existing."""
    controller = CollectionManagerController(self.user)
    assert_false(controller.collection_exists('does_not_exist'))
def test_get_collections(self):
    """Smoke test: listing collections must not raise."""
    controller = CollectionManagerController(self.user)
    controller.get_collections()
def _small_indexing(user, fs, client, source, destination, index_name):
    """Index a small amount of data into Solr directly, without a morphline job.

    Builds the CSV-update options (field names, split/separator per field,
    skipped fields, generated row id, header handling), creates the index if
    needed, then pushes data from a file or a saved query result.

    Fixes vs. original:
      * `except Exception, e` -> `except Exception as e` (Python-2-only syntax;
        the newer copy of this helper already uses `as`).
      * The cleanup handler used to swallow the original exception after
        deleting the index; it now re-raises, matching the newer copy.
    """
    # 'indexerPrimaryKey'/'indexerDefaultField' are lists; take the first entry or None.
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None

    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        # Reject files too big for direct upload.
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)

    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])

    for field in fields:
        for operation in field['operations']:
            if operation['type'] == 'split':
                field['multiValued'] = True  # Solr requires multiValued to be set when splitting
                kwargs['f.%(name)s.split' % field] = 'true'
                kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','

    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [field for field in fields if field['name'] not in skip_fields]

    if not unique_key_field:
        # No primary key chosen: generate a 'hue_id' row id.
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor']
        )

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'
        kwargs['map'] = 'NULL:'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get('id') else source['query']
            notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [field['name'] for field in fields if field['name'] != 'hue_id']
            # Assumes handle still live
            fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
            rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
    except Exception as e:
        # Clean up the half-created index, then surface the original failure.
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
        raise e
    # NOTE(review): no success return is visible in this version; the newer
    # copy of this helper returns a status dict — confirm against the caller.
def test_end_to_end(self):
    """Upload a CSV to HDFS, guess its format, create a collection and index it."""
    # Skipping as requires morplines libs to be setup
    if not is_live_cluster() or True:
        raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")

    collection_name = "test_collection"
    indexer = Indexer("test", fs=fs, jt=cluster.jt)
    input_loc = "/tmp/test.csv"

    # Upload the test file to HDFS, then open a stream on it.
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)
    stream = fs.open(input_loc)

    # Guess the format of the file and its field types.
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
    field_types = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # Find a field name available to use for the record's uuid.
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # Generate morphline config and the collection schema.
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)
    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
        schema_fields.append({"name": unique_field, "type": "string"})

    # Recreate the collection from the specified fields.
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
        collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # Index the file.
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)