Example #1
File: bigquery.py Project: zhouzach/beam
  def insert_rows(self, project_id, dataset_id, table_id, rows):
    """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.

    Returns:
      A tuple (bool, errors). If the first element is False, the second element
      will be a bigquery.InsertErrorsValueListEntry instance containing the
      specific errors.
    """

    # Prepare rows for insertion. Of special note is the row ID that we add to
    # each row to help BigQuery avoid inserting the same row multiple times.
    # BigQuery makes a best-effort attempt at deduplication when unique insert
    # IDs are provided; duplicates could otherwise occur during retries after
    # failures.
    # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
    final_rows = []
    for row in rows:
      json_object = bigquery.JsonObject()
      for k, v in row.iteritems():
        json_object.additionalProperties.append(
            bigquery.JsonObject.AdditionalProperty(
                key=k, value=to_json_value(v)))
      final_rows.append(
          bigquery.TableDataInsertAllRequest.RowsValueListEntry(
              insertId=str(self.unique_row_id),
              json=json_object))
    result, errors = self._insert_all_rows(
        project_id, dataset_id, table_id, final_rows)
    return result, errors
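
A rough usage sketch for this method (the wrapper object construction below is an assumption; only insert_rows itself appears in this example):

# Illustrative only: how a caller might pass plain dictionaries to insert_rows.
# The wrapper variable stands for whatever object exposes this method.
ok, errors = wrapper.insert_rows(
    project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    rows=[{'name': 'Alice', 'score': 42},
          {'name': 'Bob', 'score': 17}])
if not ok:
    # Per the docstring, errors describe which rows failed and why.
    for entry in errors:
        print(entry)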
Example #2
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in iteritems(sample_row):
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows)))
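
The assertion above expects insertId='_1' for the first row. A minimal sketch of how such an ID could be generated (the class and attribute names here are assumptions; the test only shows that the first ID is '_1' when the prefix is empty):

class _RowIdSketch(object):
    # Illustrative only: a per-writer counter combined with a string prefix.
    def __init__(self, prefix=''):
        self._prefix = prefix
        self._counter = 0

    @property
    def unique_row_id(self):
        self._counter += 1
        return '%s_%d' % (self._prefix, self._counter)

ids = _RowIdSketch()
assert ids.unique_row_id == '_1'  # matches the insertId expected by the test
assert ids.unique_row_id == '_2'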
Example #3
  def _convert_to_json_row(self, row):
    """Converts a dictionary row into a bigquery.JsonObject for insertion."""
    json_object = bigquery.JsonObject()
    for k, v in iteritems(row):
      if isinstance(v, decimal.Decimal):
        # Decimal values are converted to string because JSON does not support
        # the precision that decimal supports. BigQuery can handle inserts into
        # NUMERIC columns by receiving JSON with string attributes.
        v = str(v)
      json_object.additionalProperties.append(
          bigquery.JsonObject.AdditionalProperty(
              key=k, value=to_json_value(v)))
    return json_object
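
A small standard-library illustration of why the decimal branch matters (independent of the Beam helpers above): routing a Decimal through float can lose precision that a string preserves, which is what lets BigQuery accept high-precision NUMERIC values.

import decimal
import json

value = decimal.Decimal('1234567890.123456789')  # 19 significant digits
print(json.dumps(float(value)))  # rounded to double precision; trailing digits change
print(json.dumps(str(value)))    # the full '1234567890.123456789' survives as a string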
Example #4
    def insert_rows(self,
                    project_id,
                    dataset_id,
                    table_id,
                    rows,
                    skip_invalid_rows=False):
        """Inserts rows into the specified table.

    Args:
      project_id: The project id owning the table.
      dataset_id: The dataset id owning the table.
      table_id: The table id.
      rows: A list of plain Python dictionaries. Each dictionary is a row and
        each key in it is the name of a field.
      skip_invalid_rows: If there are rows with insertion errors, whether they
        should be skipped, and all others should be inserted successfully.

    Returns:
      A tuple (bool, errors). If first element is False then the second element
      will be a bigquery.InserttErrorsValueListEntry instance containing
      specific errors.
    """

        # Prepare rows for insertion. Of special note is the row ID that we add to
        # each row to help BigQuery avoid inserting the same row multiple times.
        # BigQuery makes a best-effort attempt at deduplication when unique insert
        # IDs are provided; duplicates could otherwise occur during retries after
        # failures.
        # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
        final_rows = []
        for row in rows:
            json_object = bigquery.JsonObject()
            for k, v in iteritems(row):
                if isinstance(v, decimal.Decimal):
                    # Decimal values are converted to string because JSON does
                    # not support the precision that decimal supports. BigQuery
                    # can handle inserts into NUMERIC columns by receiving JSON
                    # with string attributes.
                    v = str(v)
                json_object.additionalProperties.append(
                    bigquery.JsonObject.AdditionalProperty(
                        key=k, value=to_json_value(v)))
            final_rows.append(
                bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                    insertId=str(self.unique_row_id), json=json_object))
        result, errors = self._insert_all_rows(project_id, dataset_id,
                                               table_id, final_rows,
                                               skip_invalid_rows)
        return result, errors
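
A hedged sketch of how skip_invalid_rows might be used by a caller (the wrapper object and the sample rows are assumptions; the return shape follows the docstring above):

import decimal

# Illustrative only: invalid rows are skipped while valid rows are still inserted.
ok, errors = wrapper.insert_rows(
    project_id='my-project',
    dataset_id='my_dataset',
    table_id='my_table',
    rows=[{'amount': decimal.Decimal('19.99')},
          {'amount': 'not-a-number'}],
    skip_invalid_rows=True)
if not ok:
    for entry in errors:
        print(entry)  # per-row insert errors reported by BigQuery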