Example #1
    # Requires module-level imports:
    #   import logging
    #   from google.cloud.bigquery import LoadJobConfig
    def import_csv(self,
                   bucket_name: str,
                   bucket_path: str,
                   dataset: str,
                   table: str,
                   sep: str = "\t") -> bool:
        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} ...")
        client = self._get_client()

        # Let BigQuery infer the schema; the delimiter defaults to tab.
        config = LoadJobConfig()
        config.autodetect = True
        config.field_delimiter = sep

        # Load from the bucket the caller passed in.
        bucket_url = f"gs://{bucket_name}/{bucket_path}"

        load_job = client.load_table_from_uri(bucket_url,
                                              f"{dataset}.{table}",
                                              job_config=config)
        load_job.result()  # Block until the load finishes; raises on failure.

        logging.info(
            f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} Complete!"
        )

        return True
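
A hypothetical call site for the method above. The DataWarehouse constructor, bucket, and table names here are assumptions for illustration, not part of the original snippet:

    dw = DataWarehouse(config)           # hypothetical constructor
    ok = dw.import_csv(
        bucket_name="my-data-lake",      # hypothetical GCS bucket
        bucket_path="exports/users.csv", # hypothetical object path
        dataset="analytics",
        table="users",
        sep=",",                         # comma-separated instead of the tab default
    )
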
Example #2
    # `mock` and the _make_connection/_make_client factories are defined
    # elsewhere in the test module.
    def test_begin_w_alternate_client(self):
        """Beginning a job through an alternate client sends the full load
        configuration to that client's connection only.
        """
        from google.cloud.bigquery.job import CreateDisposition
        from google.cloud.bigquery.job import LoadJobConfig
        from google.cloud.bigquery.job import SchemaUpdateOption
        from google.cloud.bigquery.job import WriteDisposition
        from google.cloud.bigquery.schema import SchemaField

        PATH = "/projects/%s/jobs" % (self.PROJECT, )
        RESOURCE = self._make_resource(ended=True)
        LOAD_CONFIGURATION = {
            "sourceUris": [self.SOURCE1],
            "destinationTable": {
                "projectId": self.PROJECT,
                "datasetId": self.DS_ID,
                "tableId": self.TABLE_ID,
            },
            "allowJaggedRows": True,
            "allowQuotedNewlines": True,
            "createDisposition": CreateDisposition.CREATE_NEVER,
            "encoding": "ISO-8559-1",
            "fieldDelimiter": "|",
            "ignoreUnknownValues": True,
            "maxBadRecords": 100,
            "nullMarker": r"\N",
            "quote": "'",
            "skipLeadingRows": "1",
            "sourceFormat": "CSV",
            "useAvroLogicalTypes": True,
            "writeDisposition": WriteDisposition.WRITE_TRUNCATE,
            "schema": {
                "fields": [
                    {
                        "name": "full_name",
                        "type": "STRING",
                        "mode": "REQUIRED",
                        "description": None,
                    },
                    {
                        "name": "age",
                        "type": "INTEGER",
                        "mode": "REQUIRED",
                        "description": None,
                    },
                ]
            },
            "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
        }
        RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
        # The job is bound to client1, but _begin() is called with client2,
        # so only conn2 should receive the API request.
        conn1 = _make_connection()
        client1 = _make_client(project=self.PROJECT, connection=conn1)
        conn2 = _make_connection(RESOURCE)
        client2 = _make_client(project=self.PROJECT, connection=conn2)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        config = LoadJobConfig()
        config.schema = [full_name, age]
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF,
                             client1, config)
        config.allow_jagged_rows = True
        config.allow_quoted_newlines = True
        config.create_disposition = CreateDisposition.CREATE_NEVER
        config.encoding = "ISO-8859-1"
        config.field_delimiter = "|"
        config.ignore_unknown_values = True
        config.max_bad_records = 100
        config.null_marker = r"\N"
        config.quote_character = "'"
        config.skip_leading_rows = 1
        config.source_format = "CSV"
        config.use_avro_logical_types = True
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        config.schema_update_options = [
            SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
        with mock.patch(
                "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:
            job._begin(client=client2)

        final_attributes.assert_called_with({"path": PATH}, client2, job)

        conn1.api_request.assert_not_called()
        self.assertEqual(len(conn2.api_request.call_args_list), 1)
        req = conn2.api_request.call_args_list[0]
        self.assertEqual(req[1]["method"], "POST")
        self.assertEqual(req[1]["path"], PATH)
        SENT = {
            "jobReference": {
                "projectId": self.PROJECT,
                "jobId": self.JOB_ID
            },
            "configuration": {
                "load": LOAD_CONFIGURATION
            },
        }
        self.maxDiff = None  # show the full dict diff if the payload mismatches
        self.assertEqual(req[1]["data"], SENT)
        self._verifyResourceProperties(job, RESOURCE)
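
What this test pins down is the snake_case-to-camelCase mapping between LoadJobConfig properties and the REST payload. A minimal sketch of that mapping via the config's public to_api_repr() method; the output comment is an assumption based on the expectations above, not captured output:

    from google.cloud.bigquery.job import LoadJobConfig

    config = LoadJobConfig()
    config.field_delimiter = "|"
    config.skip_leading_rows = 1   # note: serialized as the string "1"
    config.max_bad_records = 100

    print(config.to_api_repr())
    # Expected, per the assertions above:
    # {'load': {'fieldDelimiter': '|', 'skipLeadingRows': '1', 'maxBadRecords': 100}}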