def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Create the target Solr index if missing, then run a Morphline indexing job over the input.

  Hive result-set handles short-circuit to a direct Solr load; table/file inputs
  resolve to an HDFS path fed to the Morphline job.
  """
  morphline_indexer = MorphlineIndexer(request.user, request.fs)

  key_field = morphline_indexer.get_unique_field(file_format)
  fields = morphline_indexer.get_kept_field_list(file_format['columns'])
  if morphline_indexer.is_unique_generated(file_format):
    # The unique key is synthesized, so it is not part of the kept columns yet.
    fields += [{"name": key_field, "type": "string"}]

  solr = SolrClient(user=request.user)
  if not solr.exists(collection_name):
    solr.create_index(
        name=collection_name,
        fields=request.POST.get('fields', fields),
        unique_key_field=key_field
    )

  source_type = file_format['inputFormat']

  if source_type == 'hs2_handle':
    # Hive fetch handles stream rows straight into Solr, bypassing Morphline entirely.
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [col['name'] for col in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])

  if source_type == 'table':
    db = dbms.get(request.user)
    metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = metadata.path_location
  elif source_type == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  else:
    input_path = None

  morphline = morphline_indexer.generate_morphline_config(collection_name, file_format, key_field, lib_path=lib_path)
  return morphline_indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  """Index a large data set into Solr by running a Morphline batch job.

  Creates the collection first when it does not exist yet.
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  uid_field = indexer.get_unique_field(file_format)
  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if indexer.is_unique_generated(file_format):
    # Generated keys are not among the kept columns; add the field explicitly.
    schema_fields += [{"name": uid_field, "type": "string"}]

  client = SolrClient(user=request.user)
  if not client.exists(collection_name):
    client.create_index(
        name=collection_name,
        fields=request.POST.get('fields', schema_fields),
        unique_key_field=uid_field
        # No df currently
    )

  input_type = file_format['inputFormat']
  if input_type == 'table':
    table_metadata = dbms.get(request.user).get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_type == 'file':
    input_path = '${nameNode}%s' % urllib.unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, uid_field, lib_path=lib_path)
  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
def importer_submit(request):
  """Importer wizard entry point: index into Solr, or create a database/table.

  Dispatches on the destination's output format and returns a JSON job handle.
  Fix: the 'destination' POST payload was JSON-parsed three times (once for
  'outputFormat', twice for the dict itself); it is now parsed exactly once.
  """
  source = json.loads(request.POST.get('source', '{}'))
  destination = json.loads(request.POST.get('destination', '{}'))
  destination['ouputFormat'] = destination['outputFormat']  # Workaround a very weird bug
  start_time = json.loads(request.POST.get('start_time', '-1'))

  if destination['ouputFormat'] == 'index':
    source['columns'] = destination['columns']
    index_name = destination["name"]

    if destination['indexerRunJob']:
      _convert_format(source["format"], inverse=True)
      job_handle = _index(request, source, index_name, start_time=start_time, lib_path=destination['indexerJobLibPath'])
    else:
      client = SolrClient(request.user)

      # First entry of each list when present, else None.
      unique_key_field = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
      df = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
      kwargs = {}

      stats = request.fs.stats(source["path"])
      if stats.size > MAX_UPLOAD_SIZE:
        raise PopupException(_('File size is too large to handle!'))

      indexer = MorphlineIndexer(request.user, request.fs)
      fields = indexer.get_kept_field_list(source['columns'])
      if not unique_key_field:
        # No user-chosen key: synthesize a row id field.
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

      if not client.exists(index_name):
        client.create_index(name=index_name, fields=fields, unique_key_field=unique_key_field, df=df)

      data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
      client.index(name=index_name, data=data, **kwargs)

      job_handle = {
          'status': 0,
          'on_success_url': reverse('search:browse', kwargs={'name': index_name})
      }
  elif destination['ouputFormat'] == 'database':
    job_handle = _create_database(request, source, destination, start_time)
  else:
    job_handle = _create_table(request, source, destination, start_time)

  return JsonResponse(job_handle)
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  """Kick off a large-scale indexing job: Morphline batch, Flume stream, or Envelope stream.

  With 'show_command' set in the POST data, stream inputs return the generated
  configuration instead of starting the job.
  """
  indexer = MorphlineIndexer(request.user, request.fs)

  key_field = indexer.get_unique_field(file_format)
  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if indexer.is_unique_generated(file_format):
    # The generated key is not part of the kept columns yet.
    schema_fields += [{"name": key_field, "type": "string"}]

  show_command = request.POST.get('show_command')

  solr = SolrClient(user=request.user)
  if not solr.exists(collection_name) and not show_command:  # if destination['isTargetExisting']:
    solr.create_index(
        name=collection_name,
        fields=request.POST.get('fields', schema_fields),
        unique_key_field=key_field
        # No df currently
    )
  else:
    pass  # TODO: check if format matches

  source = file_format['inputFormat']
  if source == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif source == 'stream' and file_format['streamSelection'] == 'flume':
    flume_indexer = FlumeIndexer(user=request.user)
    if show_command:
      configs = flume_indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    return flume_indexer.start(collection_name, file_format, destination)
  elif source == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif source == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, key_field, lib_path=lib_path)
  return indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )
def create_index(request):
  """Create a Solr index from the POSTed name/fields/key parameters; return a JSON status."""
  post = request.POST
  client = SolrClient(request.user)

  collection = client.create_index(
      name=post.get('name'),
      fields=json.loads(post.get('fields', '[]')),
      unique_key_field=post.get('unique_key_field'),
      df=post.get('df')
  )

  response = {
      'status': 0,
      'collection': collection,
      'message': _('Index created!'),
  }
  return JsonResponse(response)
class Command(BaseCommand):
  """
  Install examples but do not overwrite them.
  """

  def handle(self, *args, **options):
    """Install the requested sample Solr collection (twitter, yelp or log analytics demo)."""
    self.user = install_sample_user()
    self.client = SolrClient(self.user)

    selected = options['data']

    if selected == 'twitter_demo':
      LOG.info("Installing twitter collection")
      csv_path = self._example_csv_path('solr_configs_twitter_demo')
      self._setup_collection_from_csv(
          {
              'name': 'twitter_demo',
              'fields': self._parse_fields(csv_path, fieldtypes={
                  'source': 'string',
                  'username': '******',
              }),
              'uniqueKeyField': 'id',
              'df': 'text'
          },
          csv_path)
      LOG.info("Twitter collection successfully installed")

    if selected == 'yelp_demo':
      LOG.info("Installing yelp collection")
      csv_path = self._example_csv_path('solr_configs_yelp_demo')
      self._setup_collection_from_csv(
          {
              'name': 'yelp_demo',
              'fields': self._parse_fields(csv_path, fieldtypes={
                  'name': 'string',
              }),
              'uniqueKeyField': 'id',
              'df': 'text'
          },
          csv_path)
      LOG.info("Yelp collection successfully installed")

    if selected == 'log_analytics_demo':
      LOG.info("Installing logs collection")
      csv_path = self._example_csv_path('solr_configs_log_analytics_demo')
      self._setup_collection_from_csv(
          {
              'name': 'log_analytics_demo',
              'fields': self._parse_fields(csv_path, fieldtypes={
                  'region_code': 'string',
                  'referer': 'string',
                  'user_agent': 'string'
              }),
              'uniqueKeyField': 'id',
              'df': 'record'
          },
          csv_path)
      LOG.info("Logs collection successfully installed")

  def _example_csv_path(self, config_dir):
    # Resolve the sample CSV shipped with the search app, relative to this module.
    return os.path.abspath(os.path.join(
        os.path.dirname(__file__),
        '../../../../../../../apps/search/examples/collections/%s/index_data.csv' % config_dir
    ))

  def _setup_collection_from_csv(self, collection, path):
    # Only create and load when the index is missing ("do not overwrite").
    if not self.client.exists(collection['name']):
      self.client.create_index(
          name=collection['name'],
          fields=collection['fields'],
          unique_key_field=collection['uniqueKeyField'],
          df=collection['df'])

      with open(path) as fh:
        self.client.index(collection['name'], fh.read())

  def _parse_fields(self, path, separator=',', quote_character='"', fieldtypes={}):
    """Infer a Solr field list ({'name', 'type'} dicts) from the rows of a separated file.

    Types found in *fieldtypes* override the inferred ones.
    """
    with open(path) as fh:
      rows = utils.field_values_from_separated_file(fh, separator, quote_character)
      first_row = next(rows)
      names = list(first_row.keys())
      # Re-chain the consumed first row so inference sees every sampled row.
      inferred = utils.get_field_types(
          (list(row.values()) for row in itertools.chain([first_row], rows)),
          iterations=51)

    return [
        {'name': name, 'type': fieldtypes.get(name) or inferred_type}
        for name, inferred_type in zip(names, inferred)
    ]