Example #1
    def _process_responses_for_bigquery(self, data: Sequence[dict], table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY, field='date')

        job_config.schema = (
            SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('cluster', SqlTypeNames.STRING),
            SchemaField('name', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('query', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
            SchemaField('elements', SqlTypeNames.RECORD, 'REPEATED', fields=(
                SchemaField('content', SqlTypeNames.STRING),
                SchemaField('operation', SqlTypeNames.STRING),
                SchemaField('result', SqlTypeNames.STRING),
            )),
        )

        for data_item in data:
            data_item['date'] = data_item['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')

            for element in data_item['elements']:
                if element['result'] is not None:
                    element['result'] = str(element['result'])

        load_job = self.bigquery.client.load_table_from_json(data, table_reference, job_config=job_config)
        load_job.result()
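The snippets in this listing omit their imports. A minimal sketch of the names the examples rely on, assuming the standard google-cloud-bigquery package:

# Hedged sketch: common imports assumed by the LoadJobConfig examples in this listing.
from google.cloud.bigquery import (
    Client,
    LoadJobConfig,
    SchemaField,
    SourceFormat,
    TableReference,
    TimePartitioning,
    TimePartitioningType,
    WriteDisposition,
)
from google.cloud.bigquery.enums import SqlTypeNames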
Example #2
    def _process_response_for_bigquery(self, table_reference: TableReference,
                                       rankings: Sequence[dict]):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')

        job_config.schema = (
            SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
            SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('project_id', SqlTypeNames.INTEGER, 'REQUIRED'),
            SchemaField('prev_rank', SqlTypeNames.INTEGER),
            SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('keyword_group', SqlTypeNames.STRING, 'REPEATED'),
            SchemaField('ranking', SqlTypeNames.INTEGER),
            SchemaField('ranking_change', SqlTypeNames.INTEGER, 'REQUIRED'),
            SchemaField('rank', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('scalar_type', SqlTypeNames.STRING, 'REQUIRED'),
        )

        for ranking in rankings:
            ranking['date'] = ranking['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')

        load_job = self.bigquery.client.load_table_from_json(
            rankings, table_reference, job_config=job_config)
        load_job.result()
Example #3
def _execute_load_in_source(context, source, source_name):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(
        context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None

    context.log.info(
        'executing BQ load with config: %s for source %s' %
        (cfg.to_api_repr() if cfg else '(no config provided)', source))

    if source_name == BigQueryLoadSource.DataFrame:
        context.resources.bigquery.load_table_from_dataframe(
            source, destination, job_config=cfg).result()

    # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local
    elif source_name == BigQueryLoadSource.File:
        with open(source, 'rb') as file_obj:
            context.resources.bigquery.load_table_from_file(
                file_obj, destination, job_config=cfg).result()

    # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage
    elif source_name == BigQueryLoadSource.GCS:
        context.resources.bigquery.load_table_from_uri(source,
                                                        destination,
                                                        job_config=cfg).result()
Example #4
    def import_csv(self,
                   bucket_name: str,
                   bucket_path: str,
                   dataset: str,
                   table: str,
                   sep: str = "\t") -> bool:
        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} ...")
        client = self._get_client()

        config = LoadJobConfig()
        config.autodetect = True
        config.field_delimiter = sep

        bucket_url = f"gs://{self.config.lake_path}/{bucket_path}"

        load_job = client.load_table_from_uri(bucket_url,
                                              f"{dataset}.{table}",
                                              job_config=config)
        load_job.result()

        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} Complete!"
        )

        return True
Example #5
    def test_schema_setter_invalid_field(self):
        from google.cloud.bigquery.job import LoadJobConfig
        from google.cloud.bigquery.schema import SchemaField

        config = LoadJobConfig()
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        with self.assertRaises(ValueError):
            config.schema = [full_name, object()]
Example #6
    def populate_table(self, table_path, schema, data=[], make_immediately_available=False,
                       replace_existing_table=False):
        # type: (str, List[SchemaField], Optional[List[Any]], Optional[bool], Optional[bool]) -> None
        """Creates a table and populates it with a list of rows.

        If make_immediately_available is False, the table will be created using streaming inserts.
        Note that streaming inserts are immediately available for querying, but not for exporting or
        copying, so if you need that capability you should set make_immediately_available to True.
        https://cloud.google.com/bigquery/streaming-data-into-bigquery

        If the table is already created, it will raise a RuntimeError, unless replace_existing_table
        is True.

        Args:
          table_path: A string of the form '<dataset id>.<table name>'
              or '<project id>.<dataset id>.<table name>'.
          schema: A list of SchemaFields to represent the table's schema.
          data: A list of rows, each of which corresponds to a row to insert into the table.
          make_immediately_available: If False, the table won't immediately be available for
              copying or exporting, but will be available for querying. If True, after this
              operation returns, it will be available for copying and exporting too.
          replace_existing_table: If set to True, the table at table_path will be deleted and
              recreated if it's already present.

        Raises:
            RuntimeError if the table at table_path is already there and replace_existing_table
                is False
        """
        # Use the Table object so we can pass through the schema.
        table = Table(self.get_table_reference_from_path(table_path), schema)
        if self.table_exists(table):
            if replace_existing_table:
                self.delete_table(table)
            else:
                raise RuntimeError('The table {} already exists.'.format(table_path))
        self.create_table(table)

        if data:
            if make_immediately_available:
                output = cStringIO.StringIO()

                csv_out = csv.writer(output)
                for row in data:
                    csv_out.writerow(row)

                job_config = LoadJobConfig()
                job_config.source_format = 'text/csv'
                # By default this does six retries. It does not accept any other timeout or
                # retry parameters.
                job = self.gclient.load_table_from_file(output, table.reference,
                                                        job_config=job_config,
                                                        rewind=True)
                job.result()

                output.close()
            else:
                self._stream_chunks_of_rows(table, data, schema)
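A hedged usage sketch for populate_table above; the wrapper instance `bq`, the dataset path, and the sample rows are illustrative placeholders, not part of the original snippet:

# Illustrative only: assumes `bq` is an instance of the wrapper class above.
schema = [
    SchemaField('name', 'STRING', mode='REQUIRED'),
    SchemaField('age', 'INTEGER'),
]
rows = [('alice', 31), ('bob', 27)]
# make_immediately_available=True forces a file-based load job instead of
# streaming inserts, so the rows can be copied or exported right away.
bq.populate_table('my_dataset.people', schema, data=rows,
                  make_immediately_available=True,
                  replace_existing_table=True)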
Example #7
    def test_schema_setter(self):
        from google.cloud.bigquery.job import LoadJobConfig
        from google.cloud.bigquery.schema import SchemaField

        config = LoadJobConfig()
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        config.schema = [full_name, age]
        self.assertEqual(config.schema, [full_name, age])
Example #8
    def __build_load_config(self, gcs_uri, file_format, schema_file, append):
        gcs_file_ext = os.path.splitext(gcs_uri)[1]
        if gcs_file_ext:
            file_format = gcs_file_ext.lstrip(".")

        jc = LoadJobConfig()
        jc.source_format = self.__get_bq_format(file_format)
        jc.schema = self.__load_schema(schema_file)
        jc.write_disposition = "WRITE_APPEND" if append else "WRITE_TRUNCATE"
        return jc, file_format
Example #9
def _execute_load_in_source(context, source, source_name):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None

    context.log.info(
        'executing BQ load with config: %s for source %s'
        % (cfg.to_api_repr() if cfg else '(no config provided)', source)
    )

    context.resources.bigquery.load_table_from_source(
        source_name, source, destination, job_config=cfg
    ).result()
Example #10
    def _process_response_rows_for_bigquery(self, dataframe: DataFrame,
                                            table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column)
            for column in list(dataframe.columns.values)
        ]

        load_job = self.bigquery.client.load_table_from_dataframe(
            dataframe, table_reference, job_config=job_config)

        load_job.result()
Example #11
        def _compute_fn(context, inputs):
            destination = context.solid_config.get('destination')
            load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
            cfg = LoadJobConfig(**load_job_config) if load_job_config else None

            context.log.info(
                'executing BQ load with config: %s'
                % (cfg.to_api_repr() if cfg else '(no config provided)')
            )

            context.resources.bq.load_table_from_source(
                source, inputs, destination, job_config=cfg
            ).result()

            yield Result(None)
Example #12
def test_change_skip_leading_rows():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.load_job_config.skip_leading_rows is None
    loader = loader.skip_leading_rows(1)
    assert loader.load_job_config.skip_leading_rows == 1
Example #13
def test_change_field_delimiter():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.load_job_config.field_delimiter is None
    loader = loader.with_field_delimiter("\t")
    assert loader.load_job_config.field_delimiter == "\t"
Example #14
def test_change_null_marker():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.load_job_config.null_marker is None
    loader = loader.with_null_marker("NULL")
    assert loader.load_job_config.null_marker == "NULL"
Example #15
def test_change_encoding():
    loader = JsonDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.load_job_config.encoding is None
    loader = loader.with_encoding("UTF-8")
    assert loader.load_job_config.encoding == "UTF-8"
Example #16
def test_change_quote_character():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.load_job_config.quote_character is None
    loader = loader.with_quote_character("'")
    assert loader.load_job_config.quote_character == "'"
Example #17
def test_change_to_partition():
    loader = BaseDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.partition is None
    loader = loader.to_partition("20201023")
    assert loader.partition == "20201023"
Example #18
    def test_begin_w_autodetect(self):
        from google.cloud.bigquery.job import LoadJobConfig

        path = "/projects/{}/jobs".format(self.PROJECT)
        resource = self._make_resource()
        resource["configuration"]["load"]["autodetect"] = True
        # Ensure None for missing server-set props
        del resource["statistics"]["creationTime"]
        del resource["etag"]
        del resource["selfLink"]
        del resource["user_email"]
        conn = _make_connection(resource)
        client = _make_client(project=self.PROJECT, connection=conn)
        config = LoadJobConfig()
        config.autodetect = True
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF,
                             client, config)
        with mock.patch(
                "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:
            job._begin()

        final_attributes.assert_called_with({"path": path}, client, job)

        sent = {
            "jobReference": {
                "projectId": self.PROJECT,
                "jobId": self.JOB_ID
            },
            "configuration": {
                "load": {
                    "sourceUris": [self.SOURCE1],
                    "destinationTable": {
                        "projectId": self.PROJECT,
                        "datasetId": self.DS_ID,
                        "tableId": self.TABLE_ID,
                    },
                    "autodetect": True,
                }
            },
        }
        conn.api_request.assert_called_once_with(method="POST",
                                                 path=path,
                                                 data=sent,
                                                 timeout=None)
        self._verifyResourceProperties(job, resource)
Example #19
def _make_load_job_config(source_format,  # type: str
                          write_disposition,  # type: str
                          schema=None,  # type: Optional[List[SchemaField]]
                          skip_leading_row=False,  # type: bool
                          ):
    """
    Makes and returns a LoadJobConfig according to the passed-in parameters.

    Args:
        source_format: Should be a recognized BigQuery source format. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.sourceFormat
        write_disposition: Should be a recognized BigQuery write disposition. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.writeDisposition
        schema: A list of SchemaFields. If unset, BigQuery will try to infer a schema.
        skip_leading_row: If True, the first row of the file loaded in will be skipped.
    """
    job_config = LoadJobConfig()
    job_config.source_format = source_format
    job_config.write_disposition = write_disposition
    if schema:
        job_config.schema = schema
    else:
        job_config.autodetect = True
    if skip_leading_row:
        job_config.skip_leading_rows = 1
    return job_config
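A hedged sketch of how the helper above might be wired into a load call; `client`, `table_ref`, and the file name are placeholders assumed to exist in the caller's scope:

# Illustrative only: CSV append config that skips a header row and lets
# BigQuery infer the schema (no explicit SchemaFields passed).
job_config = _make_load_job_config(source_format='CSV',
                                   write_disposition='WRITE_APPEND',
                                   schema=None,
                                   skip_leading_row=True)
with open('rows.csv', 'rb') as fp:
    client.load_table_from_file(fp, table_ref, job_config=job_config).result()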
Example #20
def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name,
                 [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(source_fp,
                                               destination=table_ref,
                                               job_config=job_config)

    # wait for job to complete
    load_job.result()

    LOGGER.info("updated config table: %s", table_ref.table_id)
Example #21
def test_change_ignore_unknown_values():
    loader = BaseDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.load_job_config.ignore_unknown_values is None
    loader = loader.ignore_unknown_values()
    assert loader.load_job_config.ignore_unknown_values is True
    loader = loader.ignore_unknown_values(False)
    assert loader.load_job_config.ignore_unknown_values is False
Example #22
def test_constructor():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.table is None
    assert loader.from_ is None
    assert loader.load_job_config is not None
    assert loader.load_job_config.source_format == SourceFormat.CSV
    assert loader._bq_client is None
Example #23
def test_change_allow_quoted_newlines():
    loader = DsvDataLoader(table=None,
                           from_=None,
                           load_job_config=LoadJobConfig(),
                           bq_client=None)
    assert loader.load_job_config.allow_quoted_newlines is None
    loader = loader.allow_quoted_newlines()
    assert loader.load_job_config.allow_quoted_newlines is True
    loader = loader.allow_quoted_newlines(False)
    assert loader.load_job_config.allow_quoted_newlines is False
Example #24
def test_constructor():
    loader = JsonDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.table is None
    assert loader.from_ is None
    assert loader.load_job_config is not None
    assert loader.load_job_config.source_format == SourceFormat.NEWLINE_DELIMITED_JSON
    assert loader._bq_client is None
Example #25
def test_change_autodetect():
    loader = JsonDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.load_job_config.autodetect is None
    loader = loader.autodetect()
    assert loader.load_job_config.autodetect is True
    loader = loader.autodetect(False)
    assert loader.load_job_config.autodetect is False
Example #26
    def _process_responses_for_bigquery(self, responses: Sequence[dict],
                                        schema: Sequence[SchemaField],
                                        table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='request_date')
        job_config.schema = schema

        for response in responses:
            if 'request_date' in response:
                response['request_date'] = response['request_date'].strftime(
                    '%Y-%m-%dT%H:%M:%S.%f')
            if 'date' in response:
                response['date'] = response['date'].strftime(
                    '%Y-%m-%dT%H:%M:%S.%f')

        load_job = self.bigquery.client.load_table_from_json(
            responses, table_reference, job_config=job_config)
        load_job.result()
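The strftime conversions above are needed because load_table_from_json serializes each row to JSON, and Python datetime objects are not JSON-serializable; DATETIME columns accept ISO-style strings instead. A minimal illustration with a placeholder value:

import datetime

# Raw datetime objects would break the client's JSON serialization, so the
# value is rendered as a string the DATETIME column accepts.
row = {'request_date': datetime.datetime(2020, 10, 23, 12, 0)}
row['request_date'] = row['request_date'].strftime('%Y-%m-%dT%H:%M:%S.%f')
# row == {'request_date': '2020-10-23T12:00:00.000000'}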
Example #27
    def test_ctor_w_config(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery.job import LoadJobConfig

        client = _make_client(project=self.PROJECT)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        config = LoadJobConfig()
        config.schema = [full_name, age]
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF,
                             client, config)
        self.assertEqual(job.schema, [full_name, age])
        config.destination_table_description = "Description"
        expected = {"description": "Description"}
        self.assertEqual(
            config._properties["load"]["destinationTableProperties"], expected)
        friendly_name = "Friendly Name"
        config._properties["load"]["destinationTableProperties"] = {
            "friendlyName": friendly_name
        }
        self.assertEqual(config.destination_table_friendly_name, friendly_name)
Example #28
def test_change_write_disposition():
    loader = BaseDataLoader(table=None,
                            from_=None,
                            load_job_config=LoadJobConfig(),
                            bq_client=None)
    assert loader.load_job_config.write_disposition is None
    loader = loader.overwrite()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_TRUNCATE
    loader = loader.append()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_APPEND
    loader = loader.error_if_exists()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_EMPTY
Example #29
    def _process_response_rows_for_bigquery(self, rows: Sequence[dict],
                                            methods: Sequence[dict],
                                            table_reference: TableReference):
        rows_dataframe = DataFrame.from_records(rows)
        rows_dataframe['date'] = rows_dataframe['date'].apply(
            lambda x: x.date())

        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column, methods)
            for column in list(rows_dataframe.columns.values)
        ]

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                rows_dataframe, table_reference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #30
    def __init__(self,
                 *, table, partition: Optional[str] = None, from_: PackageFileLoader,
                 bq_client: Client, load_job_config: LoadJobConfig = LoadJobConfig()):
        """Constructor of DsvDataLoader.

        Args:
            table (Table): table to load data into.
            from_ (PackageFileLoader): specifies where data is.
            bq_client (Client): instance of bigquery client to use across the DSL.
            partition (Optional[str], optional): if you plan to load into a specific partition. Used as a partition decorator.
                Defaults to None.
            load_job_config (LoadJobConfig, optional): Big Query load job config.
                This is the object updated by this DSL. Defaults to LoadJobConfig().
        """
        _load_job_config = deepcopy(load_job_config)
        _load_job_config.source_format = SourceFormat.CSV
        super().__init__(table=table, partition=partition, from_=from_,
                         bq_client=bq_client, load_job_config=_load_job_config)
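Pulling the constructor together with the fluent setters exercised in the tests above, a hedged end-to-end sketch; `my_table`, `csv_source`, and `bq_client` are placeholders, and the imports follow the sketch under Example #1:

# Illustrative only: chain the DSL setters shown in the tests, accumulating
# options on the underlying LoadJobConfig.
loader = DsvDataLoader(table=my_table, from_=csv_source, bq_client=bq_client)
loader = (loader
          .skip_leading_rows(1)
          .with_field_delimiter('\t')
          .with_null_marker('NULL')
          .allow_quoted_newlines()
          .overwrite()
          .to_partition('20201023'))
# The resulting config carries the CSV source format plus the options set above.
assert loader.load_job_config.source_format == SourceFormat.CSV
assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_TRUNCATE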