Example no. 1
0
def upload_table_stats(request):
    """Collect row-count (and optionally per-column) statistics for tables.

    POST parameters:
      dbTables       -- JSON-encoded list of "db.table" names (default: []).
      sourcePlatform -- source engine name (default: 'hive').
      with_columns   -- JSON boolean; when true, also collect column stats.

    Builds ``table_stats`` tuples of (db_table, num_rows) and
    ``column_stats`` tuples of (db_table, column, data_type, num_distinct,
    num_nulls, avg_col_len).  Tables whose metadata cannot be read are
    skipped with a logged exception rather than aborting the whole batch.
    """
    response = {'status': -1}

    # BUG FIX: the '[]' default was previously passed as json.loads()'s
    # second positional argument instead of as the POST.get() default, so a
    # request without 'dbTables' raised instead of falling back to [].
    db_tables = json.loads(request.POST.get('dbTables', '[]'))
    source_platform = request.POST.get('sourcePlatform', 'hive')
    with_columns = json.loads(request.POST.get('with_columns', 'false'))

    table_stats = []
    column_stats = []

    for db_table in db_tables:
        path = _get_table_name(db_table)

        try:
            full_table_stats = json.loads(
                get_table_stats(request,
                                database=path['database'],
                                table=path['table']).content)
            # Stats arrive as a list of {'data_type': key, 'comment': value}
            # rows; flatten them into a plain dict for easy lookups.
            stats = dict((stat['data_type'], stat['comment'])
                         for stat in full_table_stats['stats'])

            table_stats.append((db_table, stats.get('numRows', -1)))

            if with_columns:
                for col in full_table_stats['columns']:
                    col_stats = json.loads(
                        get_table_stats(request,
                                        database=path['database'],
                                        table=path['table'],
                                        column=col).content)['stats']
                    # Merge the list of single-key dicts into one flat dict.
                    # (.items() instead of the Python-2-only .iteritems().)
                    col_stats = dict((key, val) for col_stat in col_stats
                                     for key, val in col_stat.items())

                    # Numeric fields arrive as strings; '' (or a missing
                    # 'distinct_count' key) means "unknown" -> report -1.
                    # The explicit '' default avoids calling int(None).
                    column_stats.append(
                        (db_table, col, col_stats['data_type'],
                         int(col_stats.get('distinct_count'))
                         if col_stats.get('distinct_count', '') != '' else -1,
                         int(col_stats['num_nulls'])
                         if col_stats['num_nulls'] != '' else -1,
                         int(float(col_stats['avg_col_len']))
                         if col_stats['avg_col_len'] != '' else -1))
        except Exception as e:  # best-effort: skip unreadable tables
            LOG.exception('Skipping upload of %s: %s' % (db_table, e))
Example no. 2
0
def upload_table_stats(request):
    """Upload table DDL, table stats and column stats to the optimizer API.

    POST parameters:
      interface      -- optimizer interface name (defaults to
                        OPTIMIZER.INTERFACE).
      db_tables      -- JSON-encoded list of "db.table" names (default []).
      sourcePlatform -- JSON-encoded engine name, e.g. "hive" or "impala".
      with_ddl       -- JSON boolean: also upload SHOW CREATE TABLE output.
      with_table     -- JSON boolean: also upload table-level stats.
      with_columns   -- JSON boolean: also upload column-level stats.

    DDL and stats collection can be globally disabled via the
    OPTIMIZER.AUTO_UPLOAD_DDL / OPTIMIZER.AUTO_UPLOAD_STATS flags.  Tables
    whose metadata cannot be read are skipped (logged), not fatal.  Returns
    a JsonResponse whose 'status' is 0 only if every requested upload
    reached an accepted state.
    """
    response = {'status': -1}

    interface = request.POST.get('interface', OPTIMIZER.INTERFACE.get())
    db_tables = json.loads(request.POST.get('db_tables', '[]'))
    source_platform = json.loads(request.POST.get('sourcePlatform', '"hive"'))
    with_ddl = json.loads(request.POST.get('with_ddl', 'false'))
    with_table_stats = json.loads(request.POST.get('with_table', 'false'))
    with_columns_stats = json.loads(request.POST.get('with_columns', 'false'))

    table_ddls = []
    table_stats = []
    column_stats = []

    if not OPTIMIZER.AUTO_UPLOAD_DDL.get():
        with_ddl = False

    if not OPTIMIZER.AUTO_UPLOAD_STATS.get():
        with_table_stats = with_columns_stats = False

    for db_table in db_tables:
        path = _get_table_name(db_table)

        try:
            if with_ddl:
                db = _get_db(request.user, source_type=source_platform)
                query = hql_query(
                    'SHOW CREATE TABLE `%(database)s`.`%(table)s`' % path)
                handle = db.execute_and_wait(query, timeout_sec=5.0)

                if handle:
                    result = db.fetch(handle, rows=5000)
                    db.close(handle)
                    table_ddls.append(
                        (0, 0, ' '.join([row[0] for row in result.rows()]),
                         path['database']))

            # BUG FIX: mock_request and full_table_stats were previously
            # created only inside the `with_table_stats` branch, so asking
            # for column stats without table stats raised NameError below.
            mock_request = MockRequest(user=request.user,
                                       source_platform=source_platform)
            full_table_stats = None

            if with_table_stats:
                full_table_stats = json.loads(
                    get_table_stats(mock_request,
                                    database=path['database'],
                                    table=path['table']).content)
                # Stats arrive as a list of {'data_type': key,
                # 'comment': value} rows; flatten into a plain dict.
                stats = dict((stat['data_type'], stat['comment'])
                             for stat in full_table_stats['stats'])

                table_stats.append({
                    'table_name': '%(database)s.%(table)s' % path,  # DB Prefix
                    'num_rows': stats.get('numRows', -1),
                    'last_modified_time': stats.get('transient_lastDdlTime', -1),
                    'total_size': stats.get('totalSize', -1),
                    'raw_data_size': stats.get('rawDataSize', -1),
                    'num_files': stats.get('numFiles', -1),
                    'num_partitions': stats.get('numPartitions', -1),
                    # bytes_cached
                    # cache_replication
                    # format
                })

            if with_columns_stats:
                if source_platform == 'impala':
                    # Impala returns stats for every column in one call
                    # (column=-1 requests all columns).
                    column_stat_payloads = json.loads(
                        get_table_stats(mock_request,
                                        database=path['database'],
                                        table=path['table'],
                                        column=-1).content)['stats']
                else:
                    # Hive needs one call per column; fetch the column list
                    # on demand if the table-stats branch did not run.
                    if full_table_stats is None:
                        full_table_stats = json.loads(
                            get_table_stats(mock_request,
                                            database=path['database'],
                                            table=path['table']).content)
                    # Cap at 25 columns to bound the number of round trips.
                    column_stat_payloads = [
                        json.loads(
                            get_table_stats(mock_request,
                                            database=path['database'],
                                            table=path['table'],
                                            column=col).content)['stats']
                        for col in full_table_stats['columns'][:25]
                    ]

                # Each payload is a list of single-key dicts; merge each
                # into one flat dict, normalising None values to ''.
                raw_column_stats = [
                    dict((key, val if val is not None else '')
                         for col_stat in payload
                         for key, val in col_stat.items())
                    for payload in column_stat_payloads
                ]

                for col_stats in raw_column_stats:
                    # Numeric fields arrive as strings; '' (or a missing
                    # key) means "unknown" and is reported as -1.  The
                    # explicit '' default avoids calling int(None).
                    column_stats.append({
                        'table_name': '%(database)s.%(table)s' % path,  # DB Prefix
                        'column_name': col_stats['col_name'],
                        'data_type': col_stats['data_type'],
                        'num_distinct': int(col_stats.get('distinct_count'))
                        if col_stats.get('distinct_count', '') != '' else -1,
                        'num_nulls': int(col_stats['num_nulls'])
                        if col_stats['num_nulls'] != '' else -1,
                        'avg_col_len': int(float(col_stats['avg_col_len']))
                        if col_stats['avg_col_len'] != '' else -1,
                        'max_size': int(float(col_stats['max_col_len']))
                        if col_stats['max_col_len'] != '' else -1,
                        'min': col_stats['min']
                        if col_stats.get('min', '') != '' else -1,
                        'max': col_stats['max']
                        if col_stats.get('max', '') != '' else -1,
                        'num_trues': col_stats['num_trues']
                        if col_stats.get('num_trues', '') != '' else -1,
                        'num_falses': col_stats['num_falses']
                        if col_stats.get('num_falses', '') != '' else -1,
                    })
        except Exception as e:  # best-effort: skip unreadable tables
            LOG.exception('Skipping upload of %s: %s' % (db_table, e))

    api = get_api(request, interface)

    response['status'] = 0

    # Each upload's status becomes -1 unless the optimizer reports an
    # accepted state; the first failure propagates into response['status'].
    if table_stats:
        response['upload_table_stats'] = api.upload(
            data=table_stats,
            data_type='table_stats',
            source_platform=source_platform)
        response['upload_table_stats_status'] = 0 if response[
            'upload_table_stats']['status']['state'] in ('WAITING', 'FINISHED',
                                                         'IN_PROGRESS') else -1
        response['status'] = response['upload_table_stats_status']
    if column_stats:
        response['upload_cols_stats'] = api.upload(
            data=column_stats,
            data_type='cols_stats',
            source_platform=source_platform)
        response['upload_cols_stats_status'] = response['status'] if response[
            'upload_cols_stats']['status']['state'] in ('WAITING', 'FINISHED',
                                                        'IN_PROGRESS') else -1
        if response['upload_cols_stats_status'] != 0:
            response['status'] = response['upload_cols_stats_status']
    if table_ddls:
        response['upload_table_ddl'] = api.upload(
            data=table_ddls,
            data_type='queries',
            source_platform=source_platform)
        response['upload_table_ddl_status'] = response['status'] if response[
            'upload_table_ddl']['status']['state'] in ('WAITING', 'FINISHED',
                                                       'IN_PROGRESS') else -1
        if response['upload_table_ddl_status'] != 0:
            response['status'] = response['upload_table_ddl_status']

    return JsonResponse(response)
Example no. 3
0
def upload_table_stats(request):
  """Collect table stats, column stats and (optionally) DDL for tables.

  POST parameters: 'db_tables' (JSON list of "db.table" names),
  'sourcePlatform' (engine name, default 'hive'), 'with_columns' and
  'with_ddl' (JSON booleans).  Tables whose metadata cannot be read are
  skipped with a logged exception rather than aborting the batch.

  NOTE(review): this variant uses Python-2-only syntax
  ('except Exception, e', dict.iteritems()); it will not parse under
  Python 3.
  """
  response = {'status': -1}

  # NOTE(review): the '[]' default is passed as json.loads()'s second
  # positional argument, NOT as the POST.get() default -- a request without
  # 'db_tables' raises TypeError here instead of defaulting to [].
  db_tables = json.loads(request.POST.get('db_tables'), '[]')
  source_platform = request.POST.get('sourcePlatform', 'hive')
  with_columns = json.loads(request.POST.get('with_columns', 'false'))
  with_ddl = json.loads(request.POST.get('with_ddl', 'false'))

  table_stats = []
  column_stats = []
  table_ddls = []

  for db_table in db_tables:
    path = _get_table_name(db_table)

    try:
      if with_ddl:
        db = dbms.get(request.user)
        query = hql_query('SHOW CREATE TABLE `%(database)s`.`%(table)s`' % path)
        handle = db.execute_and_wait(query, timeout_sec=5.0)

        if handle:
          result = db.fetch(handle, rows=5000)
          db.close(handle)
          # (0, 0, ddl_text, database) -- shaped like a query upload record.
          table_ddls.append((0, 0, ' '.join([row[0] for row in result.rows()]), path['database']))

      full_table_stats = json.loads(get_table_stats(request, database=path['database'], table=path['table']).content)
      # Stats arrive as a list of {'data_type': key, 'comment': value} rows;
      # flatten them into a plain dict for easy lookups.
      stats = dict((stat['data_type'], stat['comment']) for stat in full_table_stats['stats'])

      table_stats.append({
        'table_name': path['table'],
        'num_rows':  stats.get('numRows', -1),
        'last_modified_time':  stats.get('transient_lastDdlTime', -1),
        'total_size':  stats.get('totalSize', -1),
        'raw_data_size':  stats.get('rawDataSize', -1),
        'num_files':  stats.get('numFiles', -1),
        # bytes_cached
        # cache_replication
        # format
      })

      if with_columns:
        for col in full_table_stats['columns']:
          col_stats = json.loads(get_table_stats(request, database=path['database'], table=path['table'], column=col).content)['stats']
          # Merge the list of single-key dicts into one flat dict.
          col_stats = dict([(key, val) for col_stat in col_stats for key, val in col_stat.iteritems()])

          # Numeric fields arrive as strings; '' means "unknown" -> -1.
          # NOTE(review): int(col_stats.get('distinct_count')) raises
          # TypeError when the key is missing (get() returns None); the
          # enclosing except then skips the whole table.
          column_stats.append({
            'table_name': path['table'],
            'column_name': col,
            'data_type': col_stats['data_type'],
            "num_distinct": int(col_stats.get('distinct_count')) if col_stats.get('distinct_count') != '' else -1,
            "num_nulls": int(col_stats['num_nulls']) if col_stats['num_nulls'] != '' else -1,
            "avg_col_len": int(float(col_stats['avg_col_len'])) if col_stats['avg_col_len'] != '' else -1,
            "max_size": int(float(col_stats['max_col_len'])) if col_stats['max_col_len'] != '' else -1,
            "min": col_stats['min'] if col_stats.get('min', '') != '' else -1,
            "max": col_stats['max'] if col_stats.get('max', '') != '' else -1,
            "num_trues": col_stats['num_trues'] if col_stats.get('num_trues', '') != '' else -1,
            "num_falses": col_stats['num_falses'] if col_stats.get('num_falses', '') != '' else -1,
          })
    except Exception, e:
      LOG.exception('Skipping upload of %s: %s' % (db_table, e))