Example #1
File: bq.py Project: xinghun61/infra
def _export_builds(dataset, table_name, builds, deadline):
    """Saves builds to BigQuery.

  Logs insert errors and returns a list of ids of builds that could not be
  inserted.
  """
    # BigQuery API doc:
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
    logging.info('sending %d rows', len(builds))

    pairs = [(b, build_pb2.Build()) for b in builds]
    model.builds_to_protos_async(
        pairs,
        load_tags=True,
        load_input_properties=True,
        load_output_properties=True,
        load_steps=True,
        load_infra=True,
    ).get_result()

    # Clear fields that we don't want in BigQuery.
    for _, proto in pairs:
        proto.infra.buildbucket.hostname = ''
        for s in proto.steps:
            s.summary_markdown = ''
            s.ClearField('logs')

    res = net.json_request(
        url=(('https://www.googleapis.com/bigquery/v2/'
              'projects/%s/datasets/%s/tables/%s/insertAll') %
             (app_identity.get_application_id(), dataset, table_name)),
        method='POST',
        payload={
            'kind': 'bigquery#tableDataInsertAllRequest',
            # Do not fail entire request because of one bad build.
            # We handle invalid rows below.
            'skipInvalidRows': True,
            'ignoreUnknownValues': False,
            'rows': [{
                'insertId': str(p.id),
                'json': bqh.message_to_dict(p),
            } for _, p in pairs],
        },
        scopes=bqh.INSERT_ROWS_SCOPE,
        # Unlike this function's deadline argument (an absolute datetime),
        # json_request's deadline is a duration in seconds.
        deadline=(deadline - utils.utcnow()).total_seconds(),
    )

    failed_ids = []
    for err in res.get('insertErrors', []):
        _, bp = pairs[err['index']]
        failed_ids.append(bp.id)
        logging.error('failed to insert row for build %d: %r', bp.id,
                      err['errors'])
    return failed_ids
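The deadline argument is an absolute datetime that _export_builds converts into a remaining-seconds budget for the HTTP request. A minimal caller sketch, assuming a five-minute budget and a made-up wrapper name (neither is part of bq.py); since insertId is the build id, builds reported in failed_ids can simply be re-sent on a later run:

import datetime
import logging

def _export_builds_once(dataset, table_name, builds):
    # Hypothetical wrapper around _export_builds above; utils comes from the
    # surrounding luci components, as in the snippet itself.
    deadline = utils.utcnow() + datetime.timedelta(minutes=5)  # assumed budget
    failed_ids = _export_builds(dataset, table_name, builds, deadline)
    if failed_ids:
        # The failed builds stay queued for the next attempt; BigQuery uses
        # insertId (the build id) to de-duplicate rows that get sent twice.
        logging.warning('%d builds failed to insert; will retry later',
                        len(failed_ids))
    return failed_ids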
Example #2
def _send_to_bq(snapshots):
    """Sends the snapshots to BigQuery.

  Returns:
    Timestamps, encoded as strings, of snapshots that failed to be sent.
  """
    # See doc/Monitoring.md.
    dataset = 'isolated'
    table_name = 'stats'

    # BigQuery API doc:
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
    url = (
        'https://www.googleapis.com/bigquery/v2/projects/%s/datasets/%s/tables/'
        '%s/insertAll') % (app_identity.get_application_id(), dataset,
                           table_name)
    payload = {
        'kind': 'bigquery#tableDataInsertAllRequest',
        # Do not fail entire request because of one bad snapshot.
        # We handle invalid rows below.
        'skipInvalidRows': True,
        'ignoreUnknownValues': False,
        'rows': [{
            'insertId': s.timestamp_str,
            'json': bqh.message_to_dict(_to_proto(s)),
        } for s in snapshots],
    }
    res = net.json_request(url=url,
                           method='POST',
                           payload=payload,
                           scopes=bqh.INSERT_ROWS_SCOPE,
                           deadline=600)

    failed = []
    for err in res.get('insertErrors', []):
        t = snapshots[err['index']].timestamp_str
        if not failed:
            # Log the error for the first entry, useful to diagnose schema failure.
            logging.error('Failed to insert row %s: %r', t, err['errors'])
        failed.append(t)
    return failed
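Because insertId is the snapshot's timestamp string, BigQuery can de-duplicate (best effort) rows that end up being sent more than once. A minimal single-retry sketch under that assumption; the wrapper name and retry policy are made up for illustration:

def _send_to_bq_with_retry(snapshots):
    # Hypothetical helper, not part of the original module.
    failed = _send_to_bq(snapshots)
    if failed:
        # Re-send only the snapshots whose timestamps came back in
        # insertErrors; rows already accepted are de-duplicated by insertId.
        failed_set = set(failed)
        remaining = [s for s in snapshots if s.timestamp_str in failed_set]
        failed = _send_to_bq(remaining)
    return failed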
Example #3
def _send_to_bq_raw(dataset, table_name, rows):
    """Sends the rows to BigQuery.

  Arguments:
    dataset: BigQuery dataset name that contains the table.
    table_name: BigQuery table to stream the rows to.
    rows: list of (row_id, row) pairs to send to BQ.

  Returns:
    Indexes of rows that failed to be sent.
  """
    # BigQuery API doc:
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll
    url = (
        'https://www.googleapis.com/bigquery/v2/projects/%s/datasets/%s/tables/'
        '%s/insertAll') % (app_identity.get_application_id(), dataset,
                           table_name)
    payload = {
        'kind': 'bigquery#tableDataInsertAllRequest',
        # Do not fail entire request because of one bad row.
        # We handle invalid rows below.
        'skipInvalidRows': True,
        'ignoreUnknownValues': False,
        'rows': [{
            'insertId': row_id,
            'json': bqh.message_to_dict(row),
        } for row_id, row in rows],
    }
    res = net.json_request(url=url,
                           method='POST',
                           payload=payload,
                           scopes=bqh.INSERT_ROWS_SCOPE,
                           deadline=600)

    dropped = 0
    failed = []
    # Use this error message string to detect rows that are too old to stream.
    # This can occasionally happen when a cron job finds old entities and, by
    # the time it sends them, BigQuery no longer accepts them. Just skip these
    # and log a warning.
    out_of_time = (
        'You can only stream to date range within 365 days in the past '
        'and 183 days in the future relative to the current date')
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll#response
    for line in res.get('insertErrors', []):
        i = line['index']
        err = line['errors'][0]
        if err['reason'] == 'invalid' and out_of_time in err['message']:
            # Silently drop it. The rationale is that if it is not skipped, the loop
            # will get stuck on it.
            dropped += 1
            continue
        if not failed:
            # Log the error for the first entry, useful to diagnose schema failure.
            logging.error('Failed to insert row %s: %r', i, err)
        failed.append(i)
    if dropped:
        logging.warning('%d old rows silently dropped', dropped)
    return failed
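A minimal usage sketch for this generic helper, assuming events are protobuf messages carrying a unique event_id field; the dataset and table names here are placeholders, not values from the original code:

import logging

def _send_events(events):
    # Hypothetical caller: row_id feeds insertId, row is the message to encode.
    rows = [(e.event_id, e) for e in events]
    failed_indexes = _send_to_bq_raw('events', 'raw', rows)
    if failed_indexes:
        logging.error('%d rows were not inserted', len(failed_indexes))
    # Return the events that should be attempted again later.
    return [events[i] for i in failed_indexes]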