def _get_sample_data(db, database, table, column, is_async=False, cluster=None, operation=None):
  if operation == 'hello':
    table_obj = None  # Connectivity check only, no table lookup
  else:
    table_obj = db.get_table(database, table)
    if table_obj.is_impala_only and db.client.query_server['server_name'] != 'impala':
      # Kudu table: redirect to Impala, although Hive should support it by now
      query_server = get_query_server_config('impala', connector=cluster)
      db = dbms.get(db.client.user, query_server, cluster=cluster)

  sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async, operation=operation)
  response = {'status': -1}

  if sample_data:
    response['status'] = 0
    if is_async:
      notebook = make_notebook(
          name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
          editor_type=_get_servername(db),
          statement=sample_data,
          status='ready-execute',
          skip_historify=True,
          is_task=False,
          compute=cluster if cluster else None
      )
      response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
      if table_obj and table_obj.is_impala_only:  # Guard: table_obj is None for the 'hello' check
        response['result']['type'] = 'impala'
    else:
      sample = escape_rows(sample_data.rows(), nulls_only=True)
      if column:
        sample = set([row[0] for row in sample])
        sample = [[item] for item in sorted(list(sample))]

      response['headers'] = sample_data.cols()
      response['full_headers'] = sample_data.full_cols()
      response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response
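def _example_get_sample_data(request):
  """Illustrative sketch only, not part of the original module: fetch a
  synchronous sample of a hypothetical `default.customers` table. Assumes the
  `dbms.get(user)` helper and a Django `request` from the surrounding app; the
  response keys mirror _get_sample_data() above."""
  db = dbms.get(request.user)
  response = _get_sample_data(db, 'default', 'customers', column=None, is_async=False)
  if response['status'] == 0:
    return {'headers': response['headers'], 'rows': response['rows']}
  raise PopupException(response.get('message', _('Sampling failed.')))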
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.parse.unquote(source['path'])  # unquote moved to urllib.parse in Python 3
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.parse.unquote(source['path'])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes the query handle is still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO: warn about truncation if rows == MAX_ROWS
    elif source['inputFormat'] == 'manual':
      pass  # Nothing to index
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise  # Surface the original indexing error once the partial index is removed
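def _example_small_indexing(request, fs, client):
  """Illustrative sketch only: feed a small CSV file through _small_indexing()
  above. The `source`/`destination` dicts are made-up minimal values for the
  keys this function reads; a real call passes the full import-wizard payload
  (which _create_solr_collection() also consumes)."""
  source = {'inputFormat': 'file', 'path': '/user/demo/customers.csv'}
  destination = {'columns': [{'name': 'id', 'type': 'string', 'keep': True},
                             {'name': 'name', 'type': 'string', 'keep': True}]}
  _small_indexing(request.user, fs, client, source, destination, index_name='customers_demo')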
def run_sync_query(doc_id, user):
  '''Independently run a query as a user.'''
  # TODO: Add INSERT INTO table if persisting the result
  # TODO: Add variable substitution
  # TODO: Send notifications: done/on failure
  if isinstance(user, str):
    lookup = {orm_user_lookup(): user}
    user = User.objects.get(**lookup)
    user = rewrite_user(user)

  query_document = Document2.objects.get_by_uuid(user=user, uuid=doc_id)
  notebook = Notebook(document=query_document).get_data()
  snippet = notebook['snippets'][0]

  editor_type = snippet['type']
  sql = _get_statement(notebook)
  request = MockedDjangoRequest(user=user)
  last_executed = time.mktime(datetime.datetime.now().timetuple()) * 1000

  notebook = make_notebook(
      name='Scheduled query %s at %s' % (query_document.name, last_executed),
      editor_type=editor_type,
      statement=sql,
      status='ready',
      last_executed=last_executed,
      is_task=True
  )

  task = notebook.execute(request, batch=True)

  task['uuid'] = task['history_uuid']
  status = check_status(task)
  while status['status'] in ('waiting', 'running'):
    status = check_status(task)
    time.sleep(3)

  return task
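def _example_run_sync_query():
  """Illustrative sketch only: run a saved query document for a user resolved
  from a username string. The UUID and username are placeholders; the blocking
  poll loop inside run_sync_query() returns once the task leaves
  'waiting'/'running'."""
  task = run_sync_query('aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee', 'demo')  # hypothetical doc UUID and username
  return task['uuid']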
def run_sync_query(doc_id, user):
  '''Independently run a query as a user and insert the result into another table.'''
  # get SQL
  # Add INSERT INTO table
  # Add variables?
  # execute query
  # return when done. send email notification. get taskid.
  # see in Flower API for listing runs?
  from django.contrib.auth.models import User
  from notebook.models import make_notebook, MockedDjangoRequest
  from desktop.auth.backend import rewrite_user

  editor_type = 'impala'
  sql = 'INSERT into customer_scheduled SELECT * FROM default.customers LIMIT 100;'
  request = MockedDjangoRequest(user=rewrite_user(User.objects.get(username='******')))

  notebook = make_notebook(
      name='Scheduler query N',
      editor_type=editor_type,
      statement=sql,
      status='ready',
      #on_success_url=on_success_url,
      last_executed=time.mktime(datetime.datetime.now().timetuple()) * 1000,
      is_task=True
  )

  task = notebook.execute(request, batch=True)

  task['uuid'] = task['history_uuid']
  status = check_status(task)
  while status['status'] in ('waiting', 'running'):
    status = check_status(task)
    time.sleep(3)

  return task
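def _example_poll_with_timeout(task, timeout_seconds=300):
  """Illustrative sketch only: the polling loops above spin forever if a task
  never leaves 'waiting'/'running'; this variant adds a deadline. Uses the same
  check_status() helper and task dict as run_sync_query() above."""
  deadline = time.time() + timeout_seconds
  status = check_status(task)
  while status['status'] in ('waiting', 'running'):
    if time.time() > deadline:
      raise PopupException(_('Timed out waiting on query task %s') % task['uuid'])
    time.sleep(3)
    status = check_status(task)
  return status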
sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async, operation=operation)  # `async` renamed to `is_async`: reserved keyword in Python 3
response = {'status': -1}

if sample_data:
  response['status'] = 0
  if is_async:
    notebook = make_notebook(
        name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
        editor_type=_get_servername(db),
        statement=sample_data,
        status='ready-execute',
        skip_historify=True,
        is_task=False,
        compute=cluster if cluster else None
    )
    response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
    if table_obj.is_impala_only:
      response['result']['type'] = 'impala'
  else:
    sample = escape_rows(sample_data.rows(), nulls_only=True)
    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['headers'] = sample_data.cols()
    response['full_headers'] = sample_data.full_cols()
    response['rows'] = sample
else:
  response['message'] = _('Failed to get sample data.')

return response
if sample_data:
  response['status'] = 0
  if is_async:  # `async` renamed to `is_async`: reserved keyword in Python 3
    notebook = make_notebook(
        name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
        editor_type=_get_servername(db),
        statement=sample_data,
        status='ready-execute',
        skip_historify=True,
        is_task=False
    )
    response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
    if table_obj.is_impala_only:
      response['result']['type'] = 'impala'
  else:
    sample = escape_rows(sample_data.rows(), nulls_only=True)
    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['headers'] = sample_data.cols()
    response['full_headers'] = sample_data.full_cols()
    response['rows'] = sample
else:
  response['message'] = _('Failed to get sample data.')

return response
def _small_indexing(user, fs, client, source, destination, index_name):
  unique_key_field = destination['indexerPrimaryKey'][0] if destination['indexerPrimaryKey'] else None
  df = destination['indexerDefaultField'][0] if destination['indexerDefaultField'] else None
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    stats = fs.stats(source['path'])
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)
  fields = indexer.get_field_list(destination['columns'])
  skip_fields = [field['name'] for field in fields if not field['keep']]

  kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
  if skip_fields:
    kwargs['skip'] = ','.join(skip_fields)
    fields = [field for field in fields if field['name'] not in skip_fields]

  if not unique_key_field:
    unique_key_field = 'hue_id'
    fields += [{"name": unique_key_field, "type": "string"}]
    kwargs['rowid'] = unique_key_field

  if not destination['hasHeader']:
    kwargs['header'] = 'false'
  else:
    kwargs['skipLines'] = 1

  if not client.exists(index_name):
    client.create_index(
        name=index_name,
        config_name=destination.get('indexerConfigSet'),
        fields=fields,
        unique_key_field=unique_key_field,
        df=df,
        shards=destination['indexerNumShards'],
        replication=destination['indexerReplicationFactor']
    )

  if source['inputFormat'] == 'file':
    data = fs.read(source['path'], 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']
      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      # Assumes the query handle is still live
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over)
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO: warn about truncation if rows == MAX_ROWS
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise  # Surface the original indexing error once the partial index is removed
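def _example_csv_index_options():
  """Illustrative sketch only: the Solr CSV update-handler options that
  _small_indexing() above ends up building for a headerless two-column file
  with every field kept and no primary key picked (worked out by hand from the
  logic above, Solr 6+)."""
  return {
    'fieldnames': 'name,amount',  # joined before the synthetic key is appended to `fields`
    'rowid': 'hue_id',            # Solr generates the unique key per row
    'header': 'false',            # no header line to consume
    'processor': 'tolerant',      # Solr 6+ only: skip malformed rows instead of aborting
  }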
query_server = get_query_server_config('impala')
db = dbms.get(db.client.user, query_server)

sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async)  # `async` renamed to `is_async`: reserved keyword in Python 3
response = {'status': -1}

if sample_data:
  if is_async:
    notebook = make_notebook(
        name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
        editor_type=db.server_name,
        statement=sample_data,
        status='ready',
        is_task=False
    )
    task = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
    response['history_id'] = task['history_id']
    response['history_uuid'] = task['history_uuid']
  else:
    sample = escape_rows(sample_data.rows(), nulls_only=True)
    if column:
      sample = set([row[0] for row in sample])
      sample = [[item] for item in sorted(list(sample))]

    response['status'] = 0
    response['headers'] = sample_data.cols()
    response['full_headers'] = sample_data.full_cols()
    response['rows'] = sample
else:
  response['message'] = _('Failed to get sample data.')
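def _example_impala_fallback(db):
  """Illustrative sketch only: the connection swap used above, as a standalone
  helper. When the current session is not Impala, build an Impala query-server
  config and re-resolve the db handle for the same user via the dbms helpers
  already used in this file."""
  if db.client.query_server['server_name'] != 'impala':
    query_server = get_query_server_config('impala')
    db = dbms.get(db.client.user, query_server)
  return db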