Example #1
File: api.py Project: gilvbp/hue
def _get_sample_data(db,
                     database,
                     table,
                     column,
                     is_async=False,
                     cluster=None,
                     operation=None):
    if operation == 'hello':
        table_obj = None
    else:
        table_obj = db.get_table(database, table)
        if table_obj.is_impala_only and db.client.query_server['server_name'] != 'impala':  # Kudu table, now Hive should support it though
            query_server = get_query_server_config('impala', connector=cluster)
            db = dbms.get(db.client.user, query_server, cluster=cluster)

    sample_data = db.get_sample(database,
                                table_obj,
                                column,
                                generate_sql_only=is_async,
                                operation=operation)
    response = {'status': -1}

    if sample_data:
        response['status'] = 0
        if is_async:
            notebook = make_notebook(
                name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {
                    'database': database,
                    'table': table,
                    'column': column
                },
                editor_type=_get_servername(db),
                statement=sample_data,
                status='ready-execute',
                skip_historify=True,
                is_task=False,
                compute=cluster if cluster else None)
            response['result'] = notebook.execute(
                request=MockedDjangoRequest(user=db.client.user), batch=False)
            if table_obj and table_obj.is_impala_only:  # table_obj is None for the 'hello' operation
                response['result']['type'] = 'impala'
        else:
            sample = escape_rows(sample_data.rows(), nulls_only=True)
            if column:
                sample = set([row[0] for row in sample])
                sample = [[item] for item in sorted(list(sample))]

            response['headers'] = sample_data.cols()
            response['full_headers'] = sample_data.full_cols()
            response['rows'] = sample
    else:
        response['message'] = _('Failed to get sample data.')

    return response
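
Usage note: a minimal calling sketch, not taken from the project itself. The imports mirror the ones the helper relies on internally; `request.user` and the `default.customers` names are hypothetical stand-ins.

# Hedged usage sketch; database/table names are made up.
from beeswax.server import dbms
from beeswax.server.dbms import get_query_server_config

query_server = get_query_server_config('beeswax')  # Hive
db = dbms.get(request.user, query_server)

# Synchronous path: escaped rows plus headers come back directly.
response = _get_sample_data(db, 'default', 'customers', column=None)

# Asynchronous path: the sample statement is submitted as a notebook task instead.
response = _get_sample_data(db, 'default', 'customers', 'id', is_async=True)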
Example #2
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(
          notebook, snippet, rows=rows, start_over=start_over)  # Assumes handle still live
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise e  # surface the original indexing failure after cleanup
Example #3
def run_sync_query(doc_id, user):
    '''Independently run a query as a user.'''
    # Add INSERT INTO table if persist result
    # Add variable substitution
    # Send notifications: done/on failure
    if type(user) is str:
        lookup = {orm_user_lookup(): user}
        user = User.objects.get(**lookup)
        user = rewrite_user(user)

    query_document = Document2.objects.get_by_uuid(user=user, uuid=doc_id)
    notebook = Notebook(document=query_document).get_data()
    snippet = notebook['snippets'][0]

    editor_type = snippet['type']
    sql = _get_statement(notebook)
    request = MockedDjangoRequest(user=user)
    last_executed = time.mktime(datetime.datetime.now().timetuple()) * 1000

    notebook = make_notebook(
        name='Scheduled query %s at %s' % (query_document.name, last_executed),
        editor_type=editor_type,
        statement=sql,
        status='ready',
        last_executed=last_executed,
        is_task=True)

    task = notebook.execute(request, batch=True)

    task['uuid'] = task['history_uuid']
    status = check_status(task)

    while status['status'] in ('waiting', 'running'):
        status = check_status(task)
        time.sleep(3)

    return task
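
Usage note: a minimal invocation sketch; the document UUID and username are placeholders. Passing a `str` user exercises the lookup branch at the top of the function.

# Hypothetical values only.
task = run_sync_query('some-document2-uuid', 'admin')  # blocks while the query is waiting/running
print(task['uuid'])  # history UUID of the executed notebook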
Example #4
def run_sync_query(doc_id, user):
    '''Independently run a query as a user and insert the result into another table.'''
    # get SQL
    # Add INSERT INTO table
    # Add variables?
    # execute query
    # return when done. send email notification. get taskid.
    # see in Flower API for listing runs?
    from django.contrib.auth.models import User
    from notebook.models import make_notebook, MockedDjangoRequest

    from desktop.auth.backend import rewrite_user

    editor_type = 'impala'
    sql = 'INSERT into customer_scheduled SELECT * FROM default.customers LIMIT 100;'
    request = MockedDjangoRequest(
        user=rewrite_user(User.objects.get(username='******')))

    notebook = make_notebook(
        name='Scheduler query N',
        editor_type=editor_type,
        statement=sql,
        status='ready',
        #on_success_url=on_success_url,
        last_executed=time.mktime(datetime.datetime.now().timetuple()) * 1000,
        is_task=True)

    task = notebook.execute(request, batch=True)

    task['uuid'] = task['history_uuid']
    status = check_status(task)

    while status['status'] in ('waiting', 'running'):
        status = check_status(task)
        time.sleep(3)

    return task
Example #5
  sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async, operation=operation)  # 'async' renamed to 'is_async': reserved word since Python 3.7
  response = {'status': -1}

  if sample_data:
    response['status'] = 0
    if is_async:
      notebook = make_notebook(
          name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
          editor_type=_get_servername(db),
          statement=sample_data,
          status='ready-execute',
          skip_historify=True,
          is_task=False,
          compute=cluster if cluster else None
      )
      response['result'] = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
      if table_obj.is_impala_only:
        response['result']['type'] = 'impala'
    else:
      sample = escape_rows(sample_data.rows(), nulls_only=True)
      if column:
        sample = set([row[0] for row in sample])
        sample = [[item] for item in sorted(list(sample))]

      response['headers'] = sample_data.cols()
      response['full_headers'] = sample_data.full_cols()
      response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response
Example #6
File: api.py Project: offcocoa/hue
    if sample_data:
        response['status'] = 0
        if is_async:
            notebook = make_notebook(
                name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {
                    'database': database,
                    'table': table,
                    'column': column
                },
                editor_type=_get_servername(db),
                statement=sample_data,
                status='ready-execute',
                skip_historify=True,
                is_task=False)
            response['result'] = notebook.execute(
                request=MockedDjangoRequest(user=db.client.user), batch=False)
            if table_obj.is_impala_only:
                response['result']['type'] = 'impala'
        else:
            sample = escape_rows(sample_data.rows(), nulls_only=True)
            if column:
                sample = set([row[0] for row in sample])
                sample = [[item] for item in sorted(list(sample))]

            response['headers'] = sample_data.cols()
            response['full_headers'] = sample_data.full_cols()
            response['rows'] = sample
    else:
        response['message'] = _('Failed to get sample data.')

    return response
Example #7
def _small_indexing(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get('id') else source['query']

            notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [field['name'] for field in fields if field['name'] != 'hue_id']
            fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(
                notebook, snippet, rows=rows, start_over=start_over)  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
    except Exception as e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
        raise e  # surface the original indexing failure after cleanup
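
Usage note: a hedged sketch of the `source` and `destination` payloads this variant reads. The keys are taken from the dictionary accesses above; every value is illustrative.

# Illustrative payloads only; all values invented.
source = {
    'inputFormat': 'file',
    'path': '/user/demo/customers.csv',
}
destination = {
    'columns': [
        {'name': 'id', 'type': 'string', 'keep': True},
        {'name': 'name', 'type': 'string', 'keep': True},
    ],
    'indexerPrimaryKey': ['id'],
    'indexerDefaultField': ['name'],
    'hasHeader': True,
    'indexerConfigSet': 'managedTemplate',
    'indexerNumShards': 1,
    'indexerReplicationFactor': 1,
}

_small_indexing(user, fs, client, source, destination, 'customers_index')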
Example #8
File: api.py Project: xbing1221/hue
    query_server = get_query_server_config('impala')
    db = dbms.get(db.client.user, query_server)

  sample_data = db.get_sample(database, table_obj, column, generate_sql_only=is_async)
  response = {'status': -1}

  if sample_data:
    if is_async:
      notebook = make_notebook(
          name=_('Table sample for `%(database)s`.`%(table)s`.`%(column)s`') % {'database': database, 'table': table, 'column': column},
          editor_type=db.server_name,
          statement=sample_data,
          status='ready',
          is_task=False
      )
      task = notebook.execute(request=MockedDjangoRequest(user=db.client.user), batch=False)
      response['history_id'] = task['history_id']
      response['history_uuid'] = task['history_uuid']
    else:
      sample = escape_rows(sample_data.rows(), nulls_only=True)
      if column:
        sample = set([row[0] for row in sample])
        sample = [[item] for item in sorted(list(sample))]

      response['status'] = 0
      response['headers'] = sample_data.cols()
      response['full_headers'] = sample_data.full_cols()
      response['rows'] = sample
  else:
    response['message'] = _('Failed to get sample data.')

  return response