Example #1
    def get_table_reference_from_path(self, table_path):
        # type: (str) -> TableReference
        """
        Returns a TableReference for a given path to a BigQuery table.

        Args:
            table_path: A BigQuery table path in the form project.dataset.table

        Returns:
            A TableReference for the table specified by the path
        """
        _, dataset, table = self.parse_table_path(table_path)
        dataset_ref = DatasetReference(self.project_id, dataset)
        return TableReference(dataset_ref, table)
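
A brief aside (not part of the example above): recent google-cloud-bigquery releases expose the same conversion directly via TableReference.from_string; a minimal sketch with a made-up table path:

from google.cloud.bigquery.table import TableReference

# Parses a "project.dataset.table" path into a fully qualified reference.
table_ref = TableReference.from_string("my-project.my_dataset.my_table")
print(table_ref.project, table_ref.dataset_id, table_ref.table_id)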
Example #2
 def source(self):
     """Union[ \
         google.cloud.bigquery.table.TableReference, \
         google.cloud.bigquery.model.ModelReference \
     ]: Table or Model from which data is to be loaded or extracted.
     """
     source_config = _helpers._get_sub_prop(
         self._properties, ["configuration", "extract", "sourceTable"])
     if source_config:
         return TableReference.from_api_repr(source_config)
     else:
         source_config = _helpers._get_sub_prop(
             self._properties, ["configuration", "extract", "sourceModel"])
         return ModelReference.from_api_repr(source_config)
Example #3
    def test_from_api_repr(self):
        from google.cloud.bigquery.dataset import DatasetReference
        from google.cloud.bigquery.table import TableReference
        dataset_ref = DatasetReference('project_1', 'dataset_1')
        expected = self._make_one(dataset_ref, 'table_1')

        got = TableReference.from_api_repr(
            {
                'projectId': 'project_1',
                'datasetId': 'dataset_1',
                'tableId': 'table_1',
            })

        self.assertEqual(expected, got)
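
Note (hedged aside): the dictionary consumed by from_api_repr above is the same shape produced by to_api_repr, so the two methods round-trip; a short sketch reusing the example's names:

from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.table import TableReference

ref = TableReference(DatasetReference('project_1', 'dataset_1'), 'table_1')
# to_api_repr() emits the projectId/datasetId/tableId mapping that from_api_repr() consumes.
assert ref.to_api_repr() == {
    'projectId': 'project_1',
    'datasetId': 'dataset_1',
    'tableId': 'table_1',
}
assert TableReference.from_api_repr(ref.to_api_repr()) == ref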
Example #4
def test_to_gbq_w_default_project(mock_bigquery_client):
    """If no project is specified, we should be able to use project from
    default credentials.
    """
    import google.api_core.exceptions
    from google.cloud.bigquery.table import TableReference

    mock_bigquery_client.get_table.side_effect = (
        google.api_core.exceptions.NotFound("my_table"))
    gbq.to_gbq(DataFrame(), "my_dataset.my_table")

    mock_bigquery_client.get_table.assert_called_with(
        TableReference.from_string("default-project.my_dataset.my_table"))
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    table = mock_bigquery_client.create_table.call_args[0][0]
    assert table.project == "default-project"
Example #5
    def get_schema(self, dataset_id, table_name, project_id=None):
        # type: (str, str, Optional[str]) -> List[SchemaField]
        """Returns the schema of a table.

        Args:
          dataset_id: The dataset to query.
          table_name: The name of the table.
          project_id: The project ID of the table.
        Returns:
          A list of SchemaFields representing the schema.
        """

        dataset_ref = DatasetReference(project_id if project_id else self.project_id, dataset_id)
        table = self.gclient.get_table(TableReference(dataset_ref, table_name))

        return table.schema
Example #6
 def drop_table_if_exists(self, table_name, schema_name, project_id=None):
     if self.test_if_table_exists(
             table_name=table_name,
             schema_name=schema_name,
             project_id=project_id,
     ):
         conn = self.dwh_hook.dbconn
         conn.delete_table(
             conn.get_table(
                 TableReference(
                     dataset_ref=FakeDatasetRef(
                         dataset_id=schema_name,
                         project_id=project_id or self.database_name,
                     ),
                     table_id=table_name,
                 )))
Example #7
    def _get_table(self, connection, table_name, schema=None):
        if isinstance(connection, Engine):
            connection = connection.connect()

        client = connection.connection._client

        project_id, dataset_id, table_id = self._split_table_name(table_name)
        project_id = project_id or client.project
        dataset_id = dataset_id or schema or self.dataset_id

        table_ref = TableReference.from_string("{}.{}.{}".format(
            project_id, dataset_id, table_id))
        try:
            table = client.get_table(table_ref)
        except NotFound:
            raise NoSuchTableError(table_name)
        return table
Example #8
    def __init__(self, table_name, dataset_name=None, project_name=None):
        self._project = project_name
        self._dataset = dataset_name
        self._table = table_name

        parts = table_name.replace(":", ".").split(".")
        if len(parts) == 3:  # project.dataset.table
            self._project = parts[0]
            self._dataset = parts[1]
            self._table = parts[2]
        elif len(parts) == 2:  # dataset.table
            self._dataset = parts[0]
            self._table = parts[1]

        self._dataset_ref = DatasetReference(dataset_id=self._dataset,
                                             project=self._project)
        self._table_ref = TableReference(dataset_ref=self._dataset_ref,
                                         table_id=self._table)
Example #9
    def sources(self):
        """List[google.cloud.bigquery.table.TableReference]): Table(s) from
        which data is to be loaded.
        """
        source_configs = _helpers._get_sub_prop(
            self._properties, ["configuration", "copy", "sourceTables"])
        if source_configs is None:
            single = _helpers._get_sub_prop(
                self._properties, ["configuration", "copy", "sourceTable"])
            if single is None:
                raise KeyError(
                    "Resource missing 'sourceTables' / 'sourceTable'")
            source_configs = [single]

        sources = []
        for source_config in source_configs:
            table_ref = TableReference.from_api_repr(source_config)
            sources.append(table_ref)
        return sources
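
For reference, these are the two resource shapes the property handles, sketched with placeholder IDs:

# Hypothetical resource carrying a list of source tables under "sourceTables".
many = {"configuration": {"copy": {"sourceTables": [
    {"projectId": "p", "datasetId": "d", "tableId": "t1"},
    {"projectId": "p", "datasetId": "d", "tableId": "t2"},
]}}}
# Hypothetical resource using the older single-table "sourceTable" key.
single = {"configuration": {"copy": {"sourceTable": {
    "projectId": "p", "datasetId": "d", "tableId": "t"}}}}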
Example #10
    def _table_reference(self, provided_schema_name, provided_table_name,
                         client_project):
        project_id_from_table, dataset_id_from_table, table_id = self._split_table_name(provided_table_name)
        project_id_from_schema = None
        dataset_id_from_schema = None
        if provided_schema_name is not None:
            provided_schema_name_split = provided_schema_name.split('.')
            if len(provided_schema_name_split) == 0:
                pass
            elif len(provided_schema_name_split) == 1:
                if dataset_id_from_table:
                    project_id_from_schema = provided_schema_name_split[0]
                else:
                    dataset_id_from_schema = provided_schema_name_split[0]
            elif len(provided_schema_name_split) == 2:
                project_id_from_schema = provided_schema_name_split[0]
                dataset_id_from_schema = provided_schema_name_split[1]
            else:
                raise ValueError("Did not understand schema: {}".format(provided_schema_name))
        if (dataset_id_from_schema and dataset_id_from_table and
           dataset_id_from_schema != dataset_id_from_table):
            raise ValueError(
                "dataset_id specified in schema and table_name disagree: "
                "got {} in schema, and {} in table_name".format(
                    dataset_id_from_schema, dataset_id_from_table
                )
            )
        if (project_id_from_schema and project_id_from_table and
           project_id_from_schema != project_id_from_table):
            raise ValueError(
                "project_id specified in schema and table_name disagree: "
                "got {} in schema, and {} in table_name".format(
                    project_id_from_schema, project_id_from_table
                )
            )
        project_id = project_id_from_schema or project_id_from_table or client_project
        dataset_id = dataset_id_from_schema or dataset_id_from_table or self.dataset_id

        table_ref = TableReference.from_string("{}.{}.{}".format(
            project_id, dataset_id, table_id
        ))
        return table_ref
Example #11
    def __load_many(self, dt_ref, tables, gcs_base_dir, file_format, jc,
                    preview):
        """
        :param tables:
        :param gcs_base_dir: to map to table
        """
        jobs = list()
        for tbl in tables:
            data_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
            table_ref = TableReference(dataset_ref=dt_ref.dataset_ref,
                                       table_id=tbl)
            print("--  {}{} <= {} ".format("preview: " if preview else "", tbl,
                                           data_uri))
            if preview:
                continue

            jobs.append(
                self.connect(dt_ref.project).load_table_from_uri(
                    data_uri, table_ref, job_config=jc))
        self.__check_jobs(jobs)
Example #12
def test_to_gbq_w_project_table(mock_bigquery_client):
    """If a project is included in the table ID, use that instead of the client
    project. See: https://github.com/pydata/pandas-gbq/issues/321
    """
    import google.api_core.exceptions
    from google.cloud.bigquery.table import TableReference

    mock_bigquery_client.get_table.side_effect = (
        google.api_core.exceptions.NotFound("my_table"))
    gbq.to_gbq(
        DataFrame(),
        "project_table.my_dataset.my_table",
        project_id="project_client",
    )

    mock_bigquery_client.get_table.assert_called_with(
        TableReference.from_string("project_table.my_dataset.my_table"))
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    table = mock_bigquery_client.create_table.call_args[0][0]
    assert table.project == "project_table"
Example #13
    def _bq_get_data(self):

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        table_ref = TableReference.from_string(
            self.source_project_dataset_table)
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id,
                      table_ref.table_id)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=table_ref.dataset_id,
                table_id=table_ref.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' not in response:
                self.log.info('Job Finished')
                return

            rows = response['rows']

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = []
            table_data = [[fields['v'] for fields in dict_row['f']]
                          for dict_row in rows]

            yield table_data
            i += 1
Example #14
    def test_run_async_extract_job_submitsExtractJobAndReturnsJobIdWithProperConfig(
            self, bigquery_module_patch: bigquery):
        project_id = "some-project-id"
        table = "some-project.some-dataset.some-table"
        destination_uris = [
            "gs://some-source-uri/to_object1",
            "gs://some-source-uri/to_object2"
        ]
        job_prefix = "some_job_prefix"
        bigquery_module_patch.Client.return_value = self.client_mock

        expected_job_id = self.JOB_ID
        self.extract_job_mock = Mock(ExtractJob)
        self.extract_job_mock.job_id = expected_job_id
        self.client_mock.extract_table.return_value = self.extract_job_mock
        ems_job_config = EmsExtractJobConfig(
            compression=Compression.GZIP,
            destination_format=DestinationFormat.CSV,
            field_delimiter="Deli mit R",
            print_header=True,
            labels={"label1": "label1_value"})

        ems_bigquery_client = EmsBigqueryClient(project_id, "Emelet")
        result_job_id = ems_bigquery_client.run_async_extract_job(
            job_id_prefix=job_prefix,
            table=table,
            destination_uris=destination_uris,
            job_config=ems_job_config)
        call_args_list = self.client_mock.extract_table.call_args_list
        args = call_args_list[0][1]

        assert args["location"] == "Emelet"
        assert args["source"] == TableReference.from_string(table_id=table)
        assert args["job_id_prefix"] == job_prefix
        assert args["destination_uris"] == destination_uris
        assert args["job_config"].compression == "GZIP"
        assert args["job_config"].destination_format == "CSV"
        assert args["job_config"].field_delimiter == "Deli mit R"
        assert args["job_config"].print_header == True
        assert args["job_config"].labels == {"label1": "label1_value"}
        assert result_job_id == expected_job_id
Example #15
    def create_tables_from_dict(
            self,
            table_names_to_schemas,  # type: Dict[str, List[SchemaField]]
            dataset_id=None,  # type: Optional[str]
            replace_existing_tables=False,  # type: Optional[bool]
    ):
        # type: (...) -> None
        """Creates a set of tables from a dictionary of table names to their schemas.

        Args:
          table_names_to_schemas: A dictionary of:
            key: The table name.
            value: A list of SchemaField objects.
          dataset_id: The dataset in which to create tables. If not specified, use default dataset.
          replace_existing_tables: If True, delete and re-create existing tables. Otherwise, check
              whether any of the requested tables already exist and raise a RuntimeError if they do.

        Raises:
            RuntimeError: If replace_existing_tables is False and any of the tables requested for
                creation already exist.
        """
        dataset_id = dataset_id or self.default_dataset_id
        dataset_ref = DatasetReference(self.project_id, dataset_id)

        # If the flag isn't set to replace existing tables, raise an error if any tables we're
        # trying to create already exist.
        if not replace_existing_tables:
            self._raise_if_tables_exist(table_names_to_schemas.keys(),
                                        dataset_id)

        for name, schema in six.iteritems(table_names_to_schemas):
            table_ref = TableReference(dataset_ref, name)
            # Use the Table object so it retains its schema.
            table = bigquery.Table(table_ref, schema=schema)

            if self.table_exists(table) and replace_existing_tables:
                self.delete_table(table)
            self.create_table(table)
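
A minimal usage sketch, assuming bq_client is an instance of the wrapper class that defines this method and that the names below are made up:

from google.cloud.bigquery import SchemaField

# "bq_client", the table names, and the dataset name are hypothetical.
bq_client.create_tables_from_dict(
    {
        "events": [SchemaField("event_id", "STRING"), SchemaField("ts", "TIMESTAMP")],
        "users": [SchemaField("user_id", "STRING")],
    },
    dataset_id="analytics",
    replace_existing_tables=True,
)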
Example #16
 def __create_extract_job_mock(self,
                               job_id: str,
                               table: str,
                               has_error: bool,
                               created: datetime = datetime.now()):
     error_result = {
         'reason': 'someReason',
         'location': 'query',
         'message': 'error occurred'
     }
     extract_job_mock = Mock(ExtractJob)
     extract_job_mock.job_id = job_id
     extract_job_mock.destination_uris = ["uri1"]
     extract_job_mock.labels = {"label1": "label1_value"}
     extract_job_mock.source = TableReference.from_string(table)
     extract_job_mock.compression = None
     extract_job_mock.field_delimiter = ","
     extract_job_mock.print_header = True
     extract_job_mock.destination_format = "CSV"
     extract_job_mock.state = "DONE"
     extract_job_mock.error_result = error_result if has_error else None
     extract_job_mock.created = created
     return extract_job_mock
Example #17
    def execute(self, context):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        table_ref = TableReference.from_string(
            self.source_project_dataset_table, hook.project_id)

        configuration: Dict[str, Any] = {
            'extract': {
                'sourceTable': table_ref.to_api_repr(),
                'compression': self.compression,
                'destinationUris': self.destination_cloud_storage_uris,
                'destinationFormat': self.export_format,
            }
        }

        if self.labels:
            configuration['labels'] = self.labels

        if self.export_format == 'CSV':
            # Only set fieldDelimiter and printHeader fields if using CSV.
            # Google does not like it if you set these fields for other export
            # formats.
            configuration['extract']['fieldDelimiter'] = self.field_delimiter
            configuration['extract']['printHeader'] = self.print_header

        hook.insert_job(configuration=configuration)
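
For a hypothetical table, the configuration assembled above would look roughly like this (sketch, values made up):

# Approximate shape of the extract job configuration built in execute().
example_configuration = {
    'extract': {
        'sourceTable': {
            'projectId': 'my-project',
            'datasetId': 'my_dataset',
            'tableId': 'my_table',
        },
        'compression': 'NONE',
        'destinationUris': ['gs://my-bucket/my_table-*.csv'],
        'destinationFormat': 'CSV',
        'fieldDelimiter': ',',
        'printHeader': True,
    },
}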
Example #18
class _Base(unittest.TestCase):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.table import TableReference

    ENDPOINT = "https://bigquery.googleapis.com"
    PROJECT = "project"
    SOURCE1 = "http://example.com/source1.csv"
    DS_ID = "dataset_id"
    DS_REF = DatasetReference(PROJECT, DS_ID)
    TABLE_ID = "table_id"
    TABLE_REF = TableReference(DS_REF, TABLE_ID)
    JOB_ID = "JOB_ID"
    JOB_TYPE = "unknown"
    KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1"

    def _make_one(self, *args, **kw):
        return self._get_target_class()(*args, **kw)

    def _setUpConstants(self):
        import datetime
        from google.cloud._helpers import UTC

        self.WHEN_TS = 1437767599.006
        self.WHEN = datetime.datetime.utcfromtimestamp(
            self.WHEN_TS).replace(tzinfo=UTC)
        self.ETAG = "ETAG"
        self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID)
        self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format(
            self.ENDPOINT, self.PROJECT, self.JOB_ID)
        self.USER_EMAIL = "*****@*****.**"

    def _table_ref(self, table_id):
        from google.cloud.bigquery.table import TableReference

        return TableReference(self.DS_REF, table_id)

    def _make_resource(self, started=False, ended=False, location="US"):
        self._setUpConstants()
        return _make_job_resource(
            creation_time_ms=int(self.WHEN_TS * 1000),
            started_time_ms=int(self.WHEN_TS * 1000),
            ended_time_ms=int(self.WHEN_TS * 1000) + 1000000,
            started=started,
            ended=ended,
            etag=self.ETAG,
            endpoint=self.ENDPOINT,
            job_type=self.JOB_TYPE,
            job_id=self.JOB_ID,
            project_id=self.PROJECT,
            user_email=self.USER_EMAIL,
            location=location,
        )

    def _verifyInitialReadonlyProperties(self, job):
        # root elements of resource
        self.assertIsNone(job.etag)
        self.assertIsNone(job.self_link)
        self.assertIsNone(job.user_email)

        # derived from resource['statistics']
        self.assertIsNone(job.created)
        self.assertIsNone(job.started)
        self.assertIsNone(job.ended)

        # derived from resource['status']
        self.assertIsNone(job.error_result)
        self.assertIsNone(job.errors)
        self.assertIsNone(job.state)

    def _verifyReadonlyResourceProperties(self, job, resource):
        from datetime import timedelta

        statistics = resource.get("statistics", {})

        if "creationTime" in statistics:
            self.assertEqual(job.created, self.WHEN)
        else:
            self.assertIsNone(job.created)

        if "startTime" in statistics:
            self.assertEqual(job.started, self.WHEN)
        else:
            self.assertIsNone(job.started)

        if "endTime" in statistics:
            self.assertEqual(job.ended, self.WHEN + timedelta(seconds=1000))
        else:
            self.assertIsNone(job.ended)

        if "etag" in resource:
            self.assertEqual(job.etag, self.ETAG)
        else:
            self.assertIsNone(job.etag)

        if "selfLink" in resource:
            self.assertEqual(job.self_link, self.RESOURCE_URL)
        else:
            self.assertIsNone(job.self_link)

        if "user_email" in resource:
            self.assertEqual(job.user_email, self.USER_EMAIL)
        else:
            self.assertIsNone(job.user_email)
Example #19
    assert location == "some-location"
    assert dataset_id == "some-dataset"
    assert arraysize == 1000
    assert credentials_path == "/some/path/to.json"
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize(
    "param, value, default",
    [
        ("clustering_fields", ["a", "b", "c"], None),
        ("create_disposition", "CREATE_IF_NEEDED", None),
        (
            "destination",
            TableReference(
                DatasetReference("different-project", "different-dataset"), "table"
            ),
            None,
        ),
        (
            "destination_encryption_configuration",
            lambda enc: enc.kms_key_name
            == EncryptionConfiguration("some-configuration").kms_key_name,
            None,
        ),
        ("dry_run", True, None),
        ("labels", {"a": "b", "c": "d"}, {}),
        ("maximum_bytes_billed", 1000, None),
        ("priority", "INTERACTIVE", None),
        (
            "schema_update_options",
Example #20
        def final_func(schema_name, schema_suffix, dwh_conn_id):
            # final: move new data into the final dataset
            conn = EWAHBaseHook.get_hook_from_conn_id(dwh_conn_id).dbconn
            # get dataset objects
            try:  # create final dataset if not exists
                ds_final = conn.get_dataset(schema_name)
            except:
                print("Creating dataset {0}".format(schema_name))
                ds_final = conn.create_dataset(schema_name)
            ds_temp = conn.get_dataset(schema_name + schema_suffix)

            # copy all tables from temp dataset to final dataset
            new_tables = conn.list_tables(ds_temp)
            new_table_ids = [
                table.table_id for table in conn.list_tables(ds_temp)
            ]
            old_table_ids = [
                table.table_id for table in conn.list_tables(ds_final)
            ]
            copy_jobs = []
            for table in new_tables:
                print("Copying table {0} from temp to final dataset".format(
                    table.table_id))
                try:
                    old_table = conn.get_table(table=TableReference(
                        dataset_ref=ds_final, table_id=table.table_id))
                    conn.delete_table(old_table)
                except:
                    # ignore failure, fails if old table does not exist to begin with
                    pass
                finally:
                    final_table = ds_final.table(table.table_id)
                    copy_jobs.append(conn.copy_table(table, final_table))

            # delete tables that don't exist in temp dataset from final dataset
            for table_id in old_table_ids:
                if table_id not in new_table_ids:
                    print("Deleting table {0}".format(table_id))
                    conn.delete_table(
                        conn.get_table(
                            TableReference(dataset_ref=ds_final,
                                           table_id=table_id)))

            # make sure all copy jobs succeeded
            while copy_jobs:
                sleep(0.1)
                job = copy_jobs.pop(0)
                job.result()
                assert job.state in ("RUNNING", "DONE")
                if job.state == "RUNNING":
                    copy_jobs.append(job)
                else:
                    print("Successfully copied {0}".format(
                        job.__dict__["_properties"]["configuration"]["copy"]
                        ["destinationTable"]["tableId"]))

            # delete temp dataset
            print("Deleting temp dataset.")
            conn.delete_dataset(ds_temp,
                                delete_contents=True,
                                not_found_ok=False)

            print("Done.")
Example #21
    def commit(self):
        # The commit is where the upload is actually done for BigQuery (special case).
        # The _create_or_update_table method can be called multiple times;
        # each time, data is appended to the .avro file. When "committing",
        # this .avro file is uploaded and, depending on the load strategy, used.
        if not hasattr(self, "avro_file_name"):
            # There was no data ever uploaded
            # Do nothing
            self.log.info("Nothing to upload!")
            return

        # Clean up after yourself first
        self.avro_writer.close()

        # Fetch the relevant configuration
        project_id = self.table_creation_config.get("database_name",
                                                    self.database_name)
        assert project_id, "Missing Project ID!"
        load_strategy = self.table_creation_config["load_strategy"]
        primary_key = self.table_creation_config["primary_key"]
        schema_name = self.table_creation_config["schema_name"]
        schema_suffix = self.table_creation_config["schema_suffix"]
        table_name_final = self.table_creation_config["table_name"]
        table_suffix = "__ewah_tmp"

        columns_definition = self.table_creation_config["columns_definition"]
        new_schema_name = schema_name + schema_suffix

        is_full_refresh = (load_strategy == EC.LS_INSERT_REPLACE
                           or not self.test_if_table_exists(
                               table_name=table_name_final,
                               schema_name=new_schema_name,
                               project_id=project_id,
                           ))

        conn = self.dwh_hook.dbconn
        ds_new = conn.get_dataset(new_schema_name)

        # Create temp table with .avro file
        if is_full_refresh:
            # temp table is also the final table for full refresh!
            table_name = table_name_final
        else:
            table_name = table_name_final + table_suffix

        # Drop temp table if it already exists
        if self.test_if_table_exists(
                table_name=table_name,
                schema_name=new_schema_name,
                project_id=project_id,
        ):
            # Drop table before re-creating it
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))
        # Create temp table with .avro file
        table_obj = Table(".".join([project_id, new_schema_name, table_name]))
        if is_full_refresh and self.partition_field:
            table_obj.time_partitioning = bigquery.TimePartitioning(
                type_=self.partition_type,
                field=self.partition_field,
            )
            if self.require_partition_filter:
                table_obj.require_partition_filter = True
        self.log.info("Uploading data into table now...")
        with open(self.avro_file_name, "rb") as source_file:
            job = conn.load_table_from_file(
                file_obj=source_file,
                destination=table_obj,
                job_id_prefix="ewah_",
                rewind=True,
                job_config=LoadJobConfig(
                    autodetect=False,
                    source_format="AVRO",
                    schema=[
                        SchemaField(name=name, field_type=field["data_type"])
                        for name, field in columns_definition.items()
                    ],
                ),
            )
            try:
                job.result()
            except:
                self.log.info("Errors occured - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

        if not is_full_refresh:
            # Need to merge new rows into the existing table

            fields_pk = set(primary_key or [])
            fields_all = set(columns_definition.keys() or [])
            fields_non_pk = fields_all - fields_pk

            if load_strategy == EC.LS_UPSERT:
                assert fields_pk
            elif load_strategy == EC.LS_INSERT_ADD:
                fields_pk = []  # Ignore if set
            else:
                raise Exception("Not implemented!")

            merge_statement = """
                MERGE INTO `{target}` AS TARGET
                USING `{source}` AS SOURCE
                ON {condition}

                WHEN MATCHED THEN
                    UPDATE SET {update_fields}

                WHEN NOT MATCHED THEN
                    INSERT ({insert_fields})
                    VALUES ({insert_fields})
            """.format(
                target=".".join(
                    [project_id, new_schema_name, table_name_final]),
                source=".".join([project_id, new_schema_name, table_name]),
                condition=" AND ".join([
                    "TARGET.`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_pk
                ]) or "FALSE",
                insert_fields="`{0}`".format("`, `".join(fields_all)),
                update_fields=", ".join([
                    "`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_non_pk
                ]),
            )

            self.log.info(
                "Executing query:\n\n{0}\n\n".format(merge_statement))
            job = conn.query(
                query=merge_statement,
                job_id_prefix="ewah_",
            )
            try:
                job.result()
            except:
                self.log.info("Errors occured - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

            # Remove old temp table from dataset
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))

        self.log.info("Done!")
Example #22
def bq_insert(rows: List):
    """
    Inserts rows into BigQuery
    :param rows: list of dictionaries, each representing a row
    :return:
    """
    from google.cloud import bigquery

    if not rows:
        logging.error("no rows to upload")
        return
    bq = bigquery.Client(project=GCP_PROJECT)
    table_ref = TableReference.from_string(
        f"{GCP_PROJECT}.live.om_state_latencies")

    schema = [
        {
            "name": "date",
            "type": "DATE"
        },
        {
            "name": "sym",
            "type": "STRING"
        },
        {
            "name": "from_state",
            "type": "STRING"
        },
        {
            "name": "to_state",
            "type": "STRING"
        },
        {
            "name": "count",
            "type": "INTEGER"
        },
        {
            "name": "average",
            "type": "FLOAT"
        },
        {
            "name": "percentile_10",
            "type": "FLOAT"
        },
        {
            "name": "percentile_50",
            "type": "FLOAT"
        },
        {
            "name": "percentile_90",
            "type": "FLOAT"
        },
        {
            "name": "percentile_99",
            "type": "FLOAT"
        },
        {
            "name": "percentile_99_99",
            "type": "FLOAT"
        },
    ]

    table = Table(table_ref)
    table.schema = schema
    table = bq.create_table(table, exists_ok=True)
    logging.info("inserting {} rows".format(len(rows)))
    res = bq.insert_rows(table, rows)
    logging.info(res)
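
The schema above is written as plain dicts; the explicit SchemaField form of the first few fields would be (a sketch; recent client versions accept either form for Table.schema):

from google.cloud.bigquery import SchemaField

# Explicit SchemaField equivalents of the first few dict entries above.
schema = [
    SchemaField("date", "DATE"),
    SchemaField("sym", "STRING"),
    SchemaField("from_state", "STRING"),
    SchemaField("to_state", "STRING"),
    SchemaField("count", "INTEGER"),
    SchemaField("average", "FLOAT"),
]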
Example #23
    def _table_ref(self, table_id):
        from google.cloud.bigquery.table import TableReference

        return TableReference(self.DS_REF, table_id)
Example #24
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " +
                             str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query[
                "destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " +
                                 label_group)
            labels[key] = value

        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(
                query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " +
                query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " +
                             query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(
            query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError(
            "url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError(
            "url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(
                query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " +
                query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
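
A minimal usage sketch, assuming SQLAlchemy is installed; the connection URL below is made up:

from sqlalchemy.engine.url import make_url

url = make_url(
    "bigquery://some-project/some-dataset"
    "?location=some-location&arraysize=1000&credentials_path=/some/path/to.json"
)
# Returns (project_id, location, dataset_id, arraysize, credentials_path, job_config).
project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url)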
Example #25
def test_basic(url_with_everything):
    project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url_with_everything)

    assert project_id == 'some-project'
    assert location == 'some-location'
    assert dataset_id == 'some-dataset'
    assert arraysize == 1000
    assert credentials_path == '/some/path/to.json'
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize('param, value', [
    ('clustering_fields', ['a', 'b', 'c']),
    ('create_disposition', 'CREATE_IF_NEEDED'),
    ('destination', TableReference(DatasetReference('different-project', 'different-dataset'), 'table')),
    ('destination_encryption_configuration',
     lambda enc: enc.kms_key_name == EncryptionConfiguration('some-configuration').kms_key_name),
    ('dry_run', True),
    ('labels', {'a': 'b', 'c': 'd'}),
    ('maximum_bytes_billed', 1000),
    ('priority', 'INTERACTIVE'),
    ('schema_update_options', ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']),
    ('use_query_cache', True),
    ('write_disposition', 'WRITE_APPEND'),
])
def test_all_values(url_with_everything, param, value):
    job_config = parse_url(url_with_everything)[5]

    config_value = getattr(job_config, param)
    if callable(value):
Example #26
    project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(
        url_with_everything)

    assert project_id == 'some-project'
    assert location == 'some-location'
    assert dataset_id == 'some-dataset'
    assert arraysize == 1000
    assert credentials_path == '/some/path/to.json'
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize('param, value', [
    ('clustering_fields', ['a', 'b', 'c']),
    ('create_disposition', 'CREATE_IF_NEEDED'),
    ('destination',
     TableReference(DatasetReference('different-project', 'different-dataset'),
                    'table')),
    ('destination_encryption_configuration', lambda enc: enc.kms_key_name ==
     EncryptionConfiguration('some-configuration').kms_key_name),
    ('dry_run', True),
    ('labels', {
        'a': 'b',
        'c': 'd'
    }),
    ('maximum_bytes_billed', 1000),
    ('priority', 'INTERACTIVE'),
    ('schema_update_options',
     ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']),
    ('use_query_cache', True),
    ('write_disposition', 'WRITE_APPEND'),
])
def test_all_values(url_with_everything, param, value):