def run_quickstart(override_values={}):
    # [START bigquerydatatransfer_quickstart]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    # TODO: Update to your project ID.
    project_id = "my-project"
    # [END bigquerydatatransfer_quickstart]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)
    # [START bigquerydatatransfer_quickstart]

    # Get the full path to your project.
    parent = client.common_project_path(project_id)

    print("Supported Data Sources:")

    # Iterate over all possible data sources.
    for data_source in client.list_data_sources(parent=parent):
        print("{}:".format(data_source.display_name))
        print("\tID: {}".format(data_source.data_source_id))
        print("\tFull path: {}".format(data_source.name))
        print("\tDescription: {}".format(data_source.description))
Example #2
def scheduled_query(transfer_client, dataset_id, table_id, query_string,
                    schedule):
    # Use the DataTransferServiceClient passed in by the caller.

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = transfer_client.project

    parent = transfer_client.common_project_path(project_id)

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": table_id,
            "write_disposition": "WRITE_APPEND",
            "partitioning_field": "",
        },
        schedule=schedule)

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
        ))

    print("Created scheduled query '{}'".format(transfer_config.name))
def update_config(override_values={}):
    # [START bigquerydatatransfer_update_config]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    new_display_name = "My Transfer Config"
    # [END bigquerydatatransfer_update_config]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    new_display_name = override_values.get("new_display_name",
                                           new_display_name)
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)
    # [START bigquerydatatransfer_update_config]

    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)
    transfer_config.display_name = new_display_name

    transfer_config = transfer_client.update_transfer_config(
        {
            "transfer_config": transfer_config,
            "update_mask": field_mask_pb2.FieldMask(paths=["display_name"]),
        }
    )

    print(f"Updated config: '{transfer_config.name}'")
    print(f"New display name: '{transfer_config.display_name}'")
    # [END bigquerydatatransfer_update_config]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def run_notification(transfer_config_name, pubsub_topic):
    orig_transfer_config_name = transfer_config_name
    orig_pubsub_topic = pubsub_topic
    # [START bigquerydatatransfer_run_notification]
    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    pubsub_topic = "projects/PROJECT-ID/topics/TOPIC-ID"
    # [END bigquerydatatransfer_run_notification]
    transfer_config_name = orig_transfer_config_name
    pubsub_topic = orig_pubsub_topic

    # [START bigquerydatatransfer_run_notification]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)
    transfer_config.notification_pubsub_topic = pubsub_topic
    update_mask = field_mask_pb2.FieldMask(paths=["notification_pubsub_topic"])

    transfer_config = transfer_client.update_transfer_config(
        {
            "transfer_config": transfer_config,
            "update_mask": update_mask,
        }
    )

    print(f"Updated config: '{transfer_config.name}'")
    print(
        f"Notification Pub/Sub topic: '{transfer_config.notification_pubsub_topic}'"
    )
    # [END bigquerydatatransfer_run_notification]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def update_credentials_with_service_account(override_values={}):
    # [START bigquerydatatransfer_update_credentials]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    service_account_name = "*****@*****.**"
    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_update_credentials]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    service_account_name = override_values.get("service_account_name",
                                               service_account_name)
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)
    # [START bigquerydatatransfer_update_credentials]

    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)

    transfer_config = transfer_client.update_transfer_config(
        {
            "transfer_config": transfer_config,
            "update_mask": field_mask_pb2.FieldMask(paths=["service_account_name"]),
            "service_account_name": service_account_name,
        }
    )

    print("Updated config: '{}'".format(transfer_config.name))
    # [END bigquerydatatransfer_update_credentials]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
Example #6
def test_list_data_sources(project_id):
    client = bigquery_datatransfer.DataTransferServiceClient()

    parent = client.common_project_path(project_id)
    data_sources = list(client.list_data_sources(parent=parent))

    assert len(data_sources) >= 0
Example #7
    def ScheduledQueriesCheck(self,
                              scheduled_queries_name,
                              project_id='benq-data-etl'):
        '''
        purpose                      : check the state of a scheduled query
        param scheduled_queries_name : display name of the scheduled query
        param project_id             : project id, default = benq-data-etl
        return dictionary            : is_check_succeed => (boolean, required), is_query_execute_succeed => (boolean, required)
        '''

        starting_time = datetime.now()
        is_check_succeed = False
        is_query_execute_succeed = False

        try:
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()

            parent = transfer_client.common_project_path(project_id)

            configs = transfer_client.list_transfer_configs(parent=parent)

            config = next(
                filter(lambda x: x.display_name == scheduled_queries_name,
                       configs), None)

            if config:

                is_query_execute_succeed = config.state.name == 'SUCCEEDED'

                is_check_succeed = True

            else:
                error_log = 'config of {} not found'.format(
                    scheduled_queries_name)

        except Exception as E:

            error_log = str(E)

        ending_time = datetime.now()

        message = '{} to check state of scheduled queries: {}{} using time: {}'.format(
            'Succeeded' if is_check_succeed else 'Failed',
            scheduled_queries_name,
            '' if is_check_succeed else ' due to {}'.format(error_log),
            ending_time - starting_time)

        print(message + '\n\n')

        self.PostMessage(
            self.slack_channel, message, '{}_Log_BigQuery'.format(
                'Correct' if is_check_succeed else 'Error'))

        return {
            'is_check_succeed': is_check_succeed,
            'is_query_execute_succeed': is_query_execute_succeed
        }
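# A hedged usage sketch for the check above (the owning class and its Slack
# configuration are not shown here, so the instance name is hypothetical):
#
#   status = monitor.ScheduledQueriesCheck("daily_sales_rollup")
#   if not status["is_query_execute_succeed"]:
#       ...  # alert or re-run the scheduled query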
def transfer_client(default_credentials, project_id):
    credentials, _ = default_credentials
    options = client_options.ClientOptions(quota_project_id=project_id)

    transfer_client = bigquery_datatransfer.DataTransferServiceClient(
        credentials=credentials, client_options=options)

    # Ensure quota is always attributed to the correct project.
    bigquery_datatransfer.DataTransferServiceClient = lambda: transfer_client

    return transfer_client
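# Note: this helper is intended as a pytest fixture for the samples above; in a
# conftest.py it would typically carry a decorator such as (sketch):
#
#   @pytest.fixture(scope="session")
#   def transfer_client(default_credentials, project_id):
#       ...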
    def load_transfers(self, cli: bigquery.Client, project: str, advertiser):
        """
        Bootstrap step to create BigQuery data transfers.

        :param cli: BigQuery client instance
        :param project: Name of the project
        :param advertiser: Numeric value of an SA360 advertiser
        :return: The result of the client transfer configuration.
        """
        client = bigquery_datatransfer.DataTransferServiceClient()
        location = self.s.unwrap('location')
        project = self.s.unwrap('gcp_project_name')
        data_source = 'doubleclick_search'
        name = client.location_data_source_path(project, location, data_source)
        parent = client.location_path(project, location)
        params = Struct()
        display_name = 'SA360 Transfer {}'.format(advertiser)
        config = Bootstrap.config_exists(client, parent, display_name)
        if config is not None:
            print('start transfer')
            cprint(
                'Schedule already exists for {}. Skipping'.format(advertiser),
                'cyan'
            )
            self.wait_for_transfer(client, config)
            return config
        params['agency_id'] = str(self.s.unwrap('agency_id'))
        params['advertiser_id'] = str(advertiser)
        params['include_removed_entities'] = False
        config = {
            'display_name': display_name,
            'destination_dataset_id': DataSets.raw.dataset_id,
            'data_source_id': SystemSettings.SERVICE_NAME,
            'schedule': 'every day {}'.format(
                datetime.strftime(datetime.now(), '%H:%M')
            ),
            'params': params,
            'disabled': False,
        }

        result = client.create_transfer_config(parent, config)
        cprint(
            'Created schedule for {}'.format(advertiser),
            'cyan',
            attrs=['bold']
        )
        self.wait_for_transfer(client, result)
        return result
Example #10
def start_export(project: str, transfer_config_name: str, transfer_location: str,
                 base_date: datetime.date, backfill_day_count: int):
    """
    Start and wait for the completion of a backfill of `backfill_day_count` days, counting
    backwards from `base_date.  The base date is included in the backfill and counts as a
    day in the day count, i.e. `backfill_day_count` will backfill only .
    """
    if backfill_day_count <= 0:
        raise ValueError("Number of days to backfill must be at least 1")

    client = bigquery_datatransfer.DataTransferServiceClient()
    play_store_transfer_config = client.location_transfer_config_path(
        project, transfer_location, transfer_config_name
    )

    oldest_date = base_date - datetime.timedelta(days=backfill_day_count - 1)
    end_date = base_date

    logging.info(f"Backfilling {backfill_day_count} days: {oldest_date} to {base_date}")

    transfer_results = []
    # break backfills into BACKFILL_DAYS_MAX day segments
    while True:
        start_date = max(end_date - datetime.timedelta(days=BACKFILL_DAYS_MAX - 1), oldest_date)
        transfer_runs = trigger_backfill(start_date, end_date,
                                         play_store_transfer_config, client)
        transfer_run_names = [transfer_run.name for transfer_run
                              in sorted(transfer_runs, key=lambda run: run.schedule_time.seconds)]
        end_date = start_date - datetime.timedelta(days=1)

        # wait for backfill to complete
        # days in backfill are scheduled by the transfer service sequentially with 30s in between
        # starting from the latest date but can run in parallel
        new_results = map(wait_for_transfer, transfer_run_names)
        transfer_results.extend(new_results)

        if start_date == oldest_date:
            break
        elif start_date < oldest_date:
            raise ValueError("start_date should not be earlier than oldest_date")

    successes = len([
        result for result in transfer_results
        if result == transfer_enums.TransferState.SUCCEEDED
    ])

    if len(transfer_results) != successes:
        raise DataTransferException(f"{len(transfer_results) - successes} failed dates")
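# A worked example of the segmentation above, assuming BACKFILL_DAYS_MAX is 30
# (it is defined elsewhere): with base_date=2021-06-30 and backfill_day_count=45,
# oldest_date is 2021-05-17; the first pass schedules 2021-06-01..2021-06-30,
# the second 2021-05-17..2021-05-31, and the loop then exits because
# start_date == oldest_date.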
Example #11
def schedule_backfill(override_values={}):
    """
    Accepts a set of override values as a dictionary. This dictionary should have a 
    key "transfer_config_name" which can override the default transfer config
    """

    # [START bigquerydatatransfer_schedule_backfill]
    import datetime

    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_schedule_backfill]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)
    # [START bigquerydatatransfer_schedule_backfill]
    now = datetime.datetime.now(datetime.timezone.utc)
    start_time = now - datetime.timedelta(days=5)
    end_time = now - datetime.timedelta(days=2)

    # Some data sources, such as scheduled_query, only support a daily run.
    # Truncate start_time and end_time to midnight (00:00 UTC).
    start_time = datetime.datetime(start_time.year,
                                   start_time.month,
                                   start_time.day,
                                   tzinfo=datetime.timezone.utc)
    end_time = datetime.datetime(end_time.year,
                                 end_time.month,
                                 end_time.day,
                                 tzinfo=datetime.timezone.utc)

    response = transfer_client.schedule_transfer_runs(
        parent=transfer_config_name,
        start_time=start_time,
        end_time=end_time,
    )

    print("Started transfer runs:")
    for run in response.runs:
        print(f"backfill: {run.run_time} run: {run.name}")
    # [END bigquerydatatransfer_schedule_backfill]
    return response.runs
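# A hedged follow-up sketch: each run returned above can be polled until it
# finishes, e.g.
#
#   for run in response.runs:
#       latest = transfer_client.get_transfer_run(name=run.name)
#       print(latest.state)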
    def scheduled_query(self):

        transfer_client = bigquery_datatransfer.DataTransferServiceClient()

        # The project where the query job runs is the same as the project
        # containing the destination dataset.
        project_id = "your-project-id"
        dataset_id = "your_dataset_id"

        # This service account will be used to execute the scheduled queries. Omit
        # this request parameter to run the query as the user with the credentials
        # associated with this client.
        service_account_name = "*****@*****.**"

        # Use standard SQL syntax for the query.
        query_string = """
        SELECT
        CURRENT_TIMESTAMP() as current_time,
        @run_time as intended_run_time,
        @run_date as intended_run_date,
        17 as some_integer
        """

        parent = transfer_client.common_project_path(project_id)

        transfer_config = bigquery_datatransfer.TransferConfig(
            destination_dataset_id=dataset_id,
            display_name="Your Scheduled Query Name",
            data_source_id="scheduled_query",
            params={
                "query": query_string,
                "destination_table_name_template": "your_table_{run_date}",
                "write_disposition": "WRITE_TRUNCATE",
                "partitioning_field": "",
            },
            schedule="every 24 hours",
        )

        transfer_config = transfer_client.create_transfer_config(
            bigquery_datatransfer.CreateTransferConfigRequest(
                parent=parent,
                transfer_config=transfer_config,
                service_account_name=service_account_name,
            ))

        print("Created scheduled query '{}'".format(transfer_config.name))
def list_configs(override_values={}):
    # [START bigquerydatatransfer_list_configs]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    project_id = "my-project"
    # [END bigquerydatatransfer_list_configs]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)
    # [START bigquerydatatransfer_list_configs]
    parent = transfer_client.common_project_path(project_id)

    configs = transfer_client.list_transfer_configs(parent=parent)
    print("Got the following configs:")
    for config in configs:
        print(f"\tID: {config.name}, Schedule: {config.schedule}")
Example #14
def wait_for_transfer(transfer_name: str, timeout: int = 1200, polling_period: int = 20) -> int:
    """
    Continuously poll for run status to wait for completion
    """
    client = bigquery_datatransfer.DataTransferServiceClient()

    state = transfer_enums.TransferState.PENDING

    time_elapsed = 0
    while (state == transfer_enums.TransferState.PENDING or
           state == transfer_enums.TransferState.RUNNING):
        try:
            transfer_run = client.get_transfer_run(transfer_name)
        except GoogleAPICallError as e:
            # grpc errors are not serializable and cannot be raised in multiprocessing
            raise DataTransferException(f"Error getting transfer run: {e.message}")

        run_date = datetime.datetime.utcfromtimestamp(transfer_run.run_time.seconds).date()

        state = transfer_run.state

        if not (state == transfer_enums.TransferState.PENDING or
                state == transfer_enums.TransferState.RUNNING):
            break

        if time_elapsed >= timeout:
            logging.info(f"Transfer for {run_date} did not complete in {timeout} seconds")
            return -1
        time.sleep(polling_period)
        time_elapsed += polling_period

    if state == transfer_enums.TransferState.SUCCEEDED:
        result = "succeeded"
    elif state == transfer_enums.TransferState.CANCELLED:
        result = "cancelled"
    elif state == transfer_enums.TransferState.FAILED:
        result = "failed"
    else:
        result = "unspecified"

    logging.info(f"Transfer for {run_date} {result}")

    return state
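# Note: wait_for_transfer returns -1 on timeout and otherwise the final
# TransferState value, which is why start_export above compares each result
# against transfer_enums.TransferState.SUCCEEDED.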
Example #15
def run_quickstart():
    # [START bigquerydatatransfer_quickstart]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    project = 'my-project'  # TODO: Update to your project ID.

    # Get the full path to your project.
    parent = client.project_path(project)

    print('Supported Data Sources:')

    # Iterate over all possible data sources.
    for data_source in client.list_data_sources(parent):
        print('{}:'.format(data_source.display_name))
        print('\tID: {}'.format(data_source.data_source_id))
        print('\tFull path: {}'.format(data_source.name))
        print('\tDescription: {}'.format(data_source.description))
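# Note: this older quickstart targets google-cloud-bigquery-datatransfer
# releases before 2.0, where the client exposed project_path() and accepted a
# positional parent; with 2.x the equivalents are common_project_path() and
# list_data_sources(parent=parent), as in the first quickstart above.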
def delete_config(override_values={}):
    # [START bigquerydatatransfer_delete_transfer]
    import google.api_core.exceptions
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_delete_transfer]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)
    # [START bigquerydatatransfer_delete_transfer]
    try:
        transfer_client.delete_transfer_config(name=transfer_config_name)
    except google.api_core.exceptions.NotFound:
        print("Transfer config not found.")
    else:
        print(f"Deleted transfer config: {transfer_config_name}")
Example #17
def sample_update_transfer_config(config_name, display_name):
    # [START bigquerydatatransfer_update_config]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()
    # TODO(developer): Set the config_name which user wants to update.
    # config_name = "your-created-transfer-config-name"

    # TODO(developer): Set the display_name of the transfer_config.
    # display_name = "Your new display name"

    transfer_config = client.get_transfer_config(name=config_name)
    transfer_config.display_name = display_name
    field_mask = {"paths": ["display_name"]}
    response = client.update_transfer_config(transfer_config=transfer_config,
                                             update_mask=field_mask)

    print("Transfer config updated for '{}'".format(response.name))
    # [END bigquerydatatransfer_update_config]
    # Return the config name for testing purposes, so that it can be deleted.
    return response
Example #18
def cancel_active_transfers(project: str, transfer_config_name: str,
                            transfer_location: str):
    client = bigquery_datatransfer.DataTransferServiceClient()
    play_store_transfer_config = client.location_transfer_config_path(
        project, transfer_location, transfer_config_name)

    transfer_run_iter = client.list_transfer_runs(
        play_store_transfer_config,
        states=[
            transfer_enums.TransferState.PENDING,
            transfer_enums.TransferState.RUNNING
        ])
    delete_count = 1
    for page in transfer_run_iter.pages:
        for transfer_run in page:
            if delete_count % 20 == 0:  # pause to avoid hitting api rate limit
                time.sleep(3)
            run_date = datetime.datetime.utcfromtimestamp(
                transfer_run.run_time.seconds).date()
            client.delete_transfer_run(transfer_run.name)
            delete_count += 1
            print(f"Cancelled {transfer_run.name} for date {run_date}")
def copy_dataset(override_values={}):
    # [START bigquerydatatransfer_copy_dataset]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    destination_project_id = "my-destination-project"
    destination_dataset_id = "my_destination_dataset"
    source_project_id = "my-source-project"
    source_dataset_id = "my_source_dataset"
    # [END bigquerydatatransfer_copy_dataset]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    destination_project_id = override_values.get("destination_project_id",
                                                 destination_project_id)
    destination_dataset_id = override_values.get("destination_dataset_id",
                                                 destination_dataset_id)
    source_project_id = override_values.get("source_project_id",
                                            source_project_id)
    source_dataset_id = override_values.get("source_dataset_id",
                                            source_dataset_id)
    # [START bigquerydatatransfer_copy_dataset]
    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=destination_dataset_id,
        display_name="Your Dataset Copy Name",
        data_source_id="cross_region_copy",
        params={
            "source_project_id": source_project_id,
            "source_dataset_id": source_dataset_id,
        },
        schedule="every 24 hours",
    )
    transfer_config = transfer_client.create_transfer_config(
        parent=transfer_client.common_project_path(destination_project_id),
        transfer_config=transfer_config,
    )
    print(f"Created transfer config: {transfer_config.name}")
    # [END bigquerydatatransfer_copy_dataset]
    return transfer_config
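# A hedged cleanup sketch: the returned config can later be removed with the
# delete_config sample above, e.g.
#
#   config = copy_dataset({"destination_project_id": "my-destination-project"})
#   delete_config({"transfer_config_name": config.name})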
def create_scheduled_query(override_values={}):
    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = "your-project-id"
    dataset_id = "your_dataset_id"

    # This service account will be used to execute the scheduled queries. Omit
    # this request parameter to run the query as the user with the credentials
    # associated with this client.
    service_account_name = "*****@*****.**"
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)
    dataset_id = override_values.get("dataset_id", dataset_id)
    service_account_name = override_values.get("service_account_name",
                                               service_account_name)
    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]

    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = transfer_client.common_project_path(project_id)

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
            service_account_name=service_account_name,
        ))

    print("Created scheduled query '{}'".format(transfer_config.name))
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
Example #21
def sample_create_transfer_config(project_id,
                                  dataset_id,
                                  authorization_code=""):
    # [START bigquerydatatransfer_create_scheduled_query]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    # TODO(developer): Set the project_id to the project that contains the
    #                  destination dataset.
    # project_id = "your-project-id"

    # TODO(developer): Set the destination dataset. The authorized user must
    #                  have owner permissions on the dataset.
    # dataset_id = "your_dataset_id"

    # TODO(developer): The first time you run this sample, set the
    # authorization code to a value from the URL:
    # https://www.gstatic.com/bigquerydatatransfer/oauthz/auth?client_id=433065040935-hav5fqnc9p9cht3rqneus9115ias2kn1.apps.googleusercontent.com&scope=https://www.googleapis.com/auth/bigquery%20https://www.googleapis.com/auth/drive&redirect_uri=urn:ietf:wg:oauth:2.0:oob
    #
    # authorization_code = "_4/ABCD-EFGHIJKLMNOP-QRSTUVWXYZ"
    #
    # You can use an empty string for authorization_code in subsequent runs of
    # this code sample with the same credentials.
    #
    # authorization_code = ""

    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = f"projects/{project_id}"

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    response = client.create_transfer_config(
        request={
            "parent": parent,
            "transfer_config": transfer_config,
            "authorization_code": authorization_code,
        })

    print("Created scheduled query '{}'".format(response.name))
    # [END bigquerydatatransfer_create_scheduled_query]
    # Return the config name for testing purposes, so that it can be deleted.
    return response.name
Example #22
# Modified version of the Python example for "scheduled query"
# https://cloud.google.com/bigquery/docs/scheduling-queries#setting_up_a_scheduled_query
#
# The current CLI user must be authenticated and have the proper authorization for the
# BigQuery Data Transfer Service.

from google.cloud import bigquery_datatransfer

client = bigquery_datatransfer.DataTransferServiceClient()

# TODO: Replace variables before attempting
project = ''
source_dataset_id = ''
destination_dataset_id = ''
display_name = 'Python Data Transfer final'

parent = f"projects/{project}"

transfer_config = bigquery_datatransfer.TransferConfig(
    destination_dataset_id=destination_dataset_id,
    display_name=display_name,
    data_source_id="cross_region_copy",
    params={
        "source_dataset_id": source_dataset_id,
        "source_project_id": project,
        "overwrite_destination_table": "true"
    },
)

response = client.create_transfer_config(
    request={
        "parent": parent,
        "transfer_config": transfer_config,
    })

print("Created transfer config '{}'".format(response.name))
Example #23
from google.cloud import bigquery, bigquery_datatransfer
from google.oauth2 import service_account
import datetime
import functions as f
import os

# get credentials using the client_secrets.json which contains my service account details
credentials = service_account.Credentials.from_service_account_file(
    'client_secrets.json',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)
client_transfer = bigquery_datatransfer.DataTransferServiceClient(
    credentials=credentials)

# sql query to create a view in the gold dataset
sql = """
SELECT 
warehouse_name,
warehouse_id,
country,
product_id,
inventory_quantity,
reporting_month
 FROM `zapp-case-study.silver_warehouse.warehouse_product_overview` wpo
"""

# This view will live in the gold Dataset
dataset = 'gold_warehouse'
def bqdts_client(credentials):
    from google.cloud import bigquery_datatransfer

    return bigquery_datatransfer.DataTransferServiceClient(
        credentials=credentials)
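# A minimal usage sketch (the key file path is illustrative), reusing the
# service-account pattern from the snippet above:
#
#   from google.oauth2 import service_account
#
#   creds = service_account.Credentials.from_service_account_file(
#       "client_secrets.json",
#       scopes=["https://www.googleapis.com/auth/cloud-platform"],
#   )
#   transfer_client = bqdts_client(creds)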
Example #25
    def execute(self, data):

        # setup 
        client = self.client
        bigquery = self.bigquery
        bigquery_datatransfer = self.bigquery_datatransfer
        datetime = self.datetime 
        pytz = self.pytz        
        time = self.time 
        name = data.get("titleName")
        emails = data.get("emails")
        query = data.get("query")
        table = ""
        #

        # create a dataset first if needed
        dataset_main = self.make_dataset()
        table_id = "{}.{}".format(dataset_main, name) 
        #    

        # create a table if needed
        table = self.make_table(table_id)
        #
                

        # an interactive query
        if(self.env.get("interactive")):
            try:
                 
                """
                    query =
                    SELECT name, SUM(number) as total_people
                    FROM `bigquery-public-data.usa_names.usa_1910_2013`
                    WHERE state = 'TX'
                    GROUP BY name, state
                    ORDER BY total_people DESC
                    LIMIT 20
                """
                query_job = client.query(query)  # Make an API request.

                
                return json.dumps({
                    "schema":[{"field":x} for x in ["Name","Count"]],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            "Name":row[0],
                            "Count":row["total_people"] 
                        }
                        for row in query_job
                    ]
                })
                    
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # a batch query
        elif(self.env.get("batch")):
            try:
                job_config = bigquery.QueryJobConfig(
                    # Run at batch priority, which won't count toward concurrent rate limit.
                    priority=bigquery.QueryPriority.BATCH
                )

                """
                    query =
                    SELECT corpus
                    FROM `bigquery-public-data.samples.shakespeare`
                    GROUP BY corpus;
                """

                # Start the query, passing in the extra configuration.
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                query_job = client.get_job(
                    query_job.job_id, location=query_job.location
                )  # Make an API request.

                return "Job {} is currently in state {}".format(query_job.job_id, query_job.state)             
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 

        # a dry run
        elif (self.env.get("dry_run")):
            try:
                """ query
                    
                    SELECT name, COUNT(*) as name_count 
                    FROM `bigquery-public-data.usa_names.usa_1910_2013` 
                    WHERE state = 'WA' 
                    GROUP BY name
                """                
                job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)

                # Start the query, passing in the extra configuration.
                query_job = client.query(
                    query,
                    job_config=job_config,
                )  # Make an API request.

                # A dry run query completes immediately.
                return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)                
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # write to a  permanent table
        elif(self.env.get("permanent_table")):
            try:
                 
                """ query
                SELECT corpus
                FROM `bigquery-public-data.samples.shakespeare`
                GROUP BY corpus;
                """                
                job_config = bigquery.QueryJobConfig(destination=table_id)
                # Start the query, passing in the extra configuration.
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                # for queries that return massive results using legacy SQL
                # job_config = bigquery.QueryJobConfig(
                #     allow_large_results=True, destination=table_id, use_legacy_sql=True
                # )
                
                query_job.result()  # Wait for the job to complete.

                return "Query results loaded to the table {}".format(table_id)           
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # query caching
        elif (self.env.get("cache_enable/disable")):
            try:
                """ query

                    SELECT corpus
                    FROM `bigquery-public-data.samples.shakespeare`
                    GROUP BY corpus;
                """                
                job_config = bigquery.QueryJobConfig(
                    use_query_cache=False # to disable
                    # createDisposition="CREATE_NEVER"  # to enable
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.
                query_job.result()
                return "was this query cached? {}".format(query_job.cache_hit)
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # regular parameterized 
        elif(self.env.get("parameterized")):
            try:
                """ query = 
                 
                     SELECT word, word_count
                     FROM `bigquery-public-data.samples.shakespeare`
                     WHERE corpus = @corpus
                     AND word_count >= @min_word_count
                     ORDER BY word_count DESC;
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("corpus", "STRING", "romeoandjuliet"),
                        bigquery.ScalarQueryParameter("min_word_count", "INT64", 250),
                    ]
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                return json.dumps({
                    "schema":[{"field":x} for x in ["word","word_count"]],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            "word":row["word"],
                            "word_count":row["word_count"] 
                        }
                        for row in query_job
                    ]
                })


            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # array parameterized
        elif(self.env.get("array_parameterized")):
            try:
                schema = ["name","count"]
                """query = 
                    
                    SELECT name, sum(number) as count
                    FROM `bigquery-public-data.usa_names.usa_1910_2013`
                    WHERE gender = @gender
                    AND state IN UNNEST(@states)
                    GROUP BY name
                    ORDER BY count DESC
                    LIMIT 10;
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("gender", "STRING", "M"),
                        bigquery.ArrayQueryParameter("states", "STRING", ["WA", "WI", "WV", "WY"]),
                    ]
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.

                return json.dumps({
                    "schema":[{"field":x} for x in schema],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]:row[schema[0]],
                            schema[1]:row[schema[1]] 
                        }
                        for row in query_job
                    ]
                })
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # timestamp parameters
        elif(self.env.get("timestamp_parameterized")):
            try:
                schema = ["date","f0_"]
                """query = 
                    
                    SELECT TIMESTAMP_ADD(@ts_value, INTERVAL 1 HOUR);
                 """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter(
                            "ts_value",
                            "TIMESTAMP",
                            datetime.datetime(2016, 12, 7, 8, 0, tzinfo=pytz.UTC),
                        )
                    ]
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.    
                return json.dumps(
                    {
                        "schema":[{"field":x} for x in schema],
                        "data":[
                            # Row values can be accessed by field name or index.
                            {
                                "date":row[0].strftime("%m/%d/%Y, %H:%M:%S"),
                                schema[1]:row[schema[1]] 
                            }
                            for row in query_job
                        ]
                    },
                    default=str
                )
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # struct parameters
        elif(self.env.get("struct_parameterized")):
            
            try:
                schema = ["s"]
                """ query = 
                SELECT @struct_value AS s;
                """
                job_config = bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.StructQueryParameter(
                            "struct_value",
                            bigquery.ScalarQueryParameter("x", "INT64", 1),
                            bigquery.ScalarQueryParameter("y", "STRING", "foo"),
                        )
                    ]
                )
                query_job = client.query(query, job_config=job_config)  # Make an API request.
                return json.dumps({
                    "schema":[{"field":x} for x in schema],
                    "data":[
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]:str(row[schema[0]]),
                        }
                        for row in query_job
                    ]
                })           
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # wildcard
        elif (self.env.get("mutliple_by_wildcard")):
            try:
                job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
                # replace the asterisk with a number or change the table suffixes to see the amount of data used
                """ query
                
                SELECT
                max,
                ROUND((max-32)*5/9,1) celsius,
                mo,
                da,
                year
                FROM
                `bigquery-public-data.noaa_gsod.gsod194*` 
                WHERE
                max != 9999.9 # code for missing data
                AND ( _TABLE_SUFFIX = '0'
                    OR _TABLE_SUFFIX = '4' )
                ORDER BY
                max DESC 
                """
                # Start the query, passing in the extra configuration.
                query_job = client.query(
                    query
                    ,
                    job_config=job_config,
                )  # Make an API request.

                # A dry run query completes immediately.
                return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)                
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 
                
        # scheduled query
            # created through the BigQuery Data Transfer Service client
        elif(self.env.get("scheduled")):
            try:
                # Use standard SQL syntax for the query.
                """ query = 
                
                SELECT
                CURRENT_TIMESTAMP() as current_time,
                @run_time as intended_run_time,
                @run_date as intended_run_date,
                17 as some_integer
                """

                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                
                project_id = client.project
                dataset_id = self.dataset_names[0]
                # This service account will be used to execute the scheduled queries. Omit
                # this request parameter to run the query as the user with the credentials
                # associated with this client. Remember roles and permissions.
                # service_account_name = "*****@*****.**"


                parent = transfer_client.common_project_path(project_id)

                transfer_config = bigquery_datatransfer.TransferConfig(
                    destination_dataset_id=dataset_id,
                    display_name="Your Scheduled Query Name",
                    data_source_id="scheduled_query",
                    params={
                        "query": query,
                        "destination_table_name_template": "your_table_{run_date}",
                        "write_disposition": "WRITE_APPEND",
                        "partitioning_field": "",
                    },
                    schedule="every 24 hours",
                )

                transfer_config = transfer_client.create_transfer_config(
                    bigquery_datatransfer.CreateTransferConfigRequest(
                        parent=parent,
                        transfer_config=transfer_config,
                        # service_account_name=service_account_name,
                    )
                )

                return "Created scheduled query '{}'".format(transfer_config.name)       
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #        

        # check schedule status
            # these show up under Scheduled Queries in the BigQuery console
        elif(self.env.get("scheduled_status")):
            try:
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                project_id = client.project
                parent = transfer_client.common_project_path(project_id)

                configs = transfer_client.list_transfer_configs(parent=parent)
                result = "Got the following configs:"
                for config in configs:
                    result += f"\n\tID: {config.name}, Schedule: {config.schedule}"      
                return result   
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # manual runs
            # used to immediately trigger runs of a scheduled query (a backfill)
        elif (self.env.get("manual_run")):
            try:
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                transfer_config_name = query
                now = datetime.datetime.now(datetime.timezone.utc)
                start_time = now - datetime.timedelta(days=5)
                end_time = now - datetime.timedelta(days=2)

                # Some data sources, such as scheduled_query, only support a daily run.
                # Truncate start_time and end_time to midnight (00:00 UTC).
                start_time = datetime.datetime(
                    start_time.year, start_time.month, start_time.day, tzinfo=datetime.timezone.utc
                )
                end_time = datetime.datetime(
                    end_time.year, end_time.month, end_time.day, tzinfo=datetime.timezone.utc
                )

                response = transfer_client.schedule_transfer_runs(
                    parent=transfer_config_name,
                    start_time=start_time,
                    end_time=end_time,
                )

                result = "Started transfer runs:"
                for run in response.runs:
                    result += f"\n\tbackfill: {run.run_time} run: {run.name}"    
                return result        
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        # 

        # delete scheduled query
        elif(self.env.get("scheduled_delete")):
            try:
                import google.api_core.exceptions
                transfer_client = bigquery_datatransfer.DataTransferServiceClient()
                transfer_config_name = query
                transfer_client.delete_transfer_config(name=transfer_config_name)
                return f"Deleted transfer config: {transfer_config_name}"
            except google.api_core.exceptions.NotFound:
                return "Transfer config not found."                            
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        return "Check the backend env dictionary you did set it so the backend didnt do anything"
Example #26
    def execute(self, name):
        # create a dataset
        if (self.env.get("create")):
            dataset_id = self.make_dataset_id(name)
            dataset_init = bigquery.Dataset(dataset_id)
            dataset_init.location = "US"  # multi-region
            try:
                if (name == ""):
                    raise IndexError
                dataset = client.create_dataset(dataset_init, timeout=30)
                return "Created dataset {}.{}".format(client.project,
                                                      dataset.dataset_id)
            except IndexError:
                return "Please input a name"
            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "Dataset already exists choose another name"
        #

        # copy a dataset
        elif (self.env.get("copy")):
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()

            # create a source dataset
            source_project_id = client.project
            source_dataset_id = "my_source_dataset"
            dataset_id = self.make_dataset_id(source_dataset_id)
            dataset_init = bigquery.Dataset(dataset_id)
            try:
                dataset = client.create_dataset(dataset_init, timeout=30)
            except BaseException as e:
                print("")  #no-op
            #

            try:
                destination_project_id = client.project
                destination_dataset_id = name
                if (name == ""):
                    raise IndexError
                transfer_config = bigquery_datatransfer.TransferConfig(
                    destination_dataset_id=destination_dataset_id,
                    display_name="Your Dataset Copy Name",
                    data_source_id="cross_region_copy",
                    params={
                        "source_project_id": source_project_id,
                        "source_dataset_id": source_dataset_id,
                    },
                    # schedule="from ",
                )
                transfer_config = transfer_client.create_transfer_config(
                    parent=transfer_client.common_project_path(
                        destination_project_id),
                    transfer_config=transfer_config,
                )
                return "Created transfer config: {}".format(
                    transfer_config.name)
            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "Was not able to make a copy mabye check the terminal where the python server is running"
        #

        # access control
        elif (self.env.get("access_control")):
            try:
                dataset_id = self.make_dataset_id(name)
                dataset = client.get_dataset(dataset_id)
                entry = bigquery.AccessEntry(
                    role="READER",
                    entity_type="userByEmail",
                    entity_id="*****@*****.**",
                )

                entries = list(dataset.access_entries)
                entries.append(entry)
                dataset.access_entries = entries
                dataset = client.update_dataset(dataset, ["access_entries"])
                full_dataset_id = "{}.{}".format(dataset.project,
                                                 dataset.dataset_id)
                return "Updated dataset '{}' with modified user permissions.".format(
                    full_dataset_id)
            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "An error occured"
        #

        # list all datasets
        elif (self.env.get("list")):
            try:
                datasets = list(client.list_datasets())
                project = client.project
                value = "datasets in project {}".format(project)
                if datasets:
                    for dataset in datasets:
                        value += "\n{}".format(dataset.dataset_id)
                    return value
                else:
                    return "{} project does not contain any datasets.".format(
                        project)
            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "An error occured"
        #

        # metadata
        elif (self.env.get("metadata")):
            value = "Metadata:\n"
            try:
                # get the dataset friendly name
                dataset_id = self.make_dataset_id(name)
                dataset = client.get_dataset(dataset_id)
                full_dataset_id = "{}.{}".format(dataset.project,
                                                 dataset.dataset_id)
                value += "{}: {}\n".format("Friendly Name",
                                           dataset.friendly_name)
                #

                # more properties
                value += "{}: {}\n".format("Description", dataset.description)
                value += "Labels:\n"
                labels = dataset.labels
                if labels:
                    for label, label_value in labels.items():
                        value += "\n{}: {}".format(label, label_value)
                else:
                    value += "\tDataset has no labels defined.\n"
                #

                # View tables
                value += "Tables:"
                tables = list(client.list_tables(dataset))
                if tables:
                    for table in tables:
                        value += "\n{}:".format(table.table_id)
                else:
                    value += "\tDataset does not have tables.\n"
                #

                return value

            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "An error occured"
        #

        # update dataset
        elif (self.env.get("update")):

            value = ""
            try:
                # update dataset description
                dataset_id = self.make_dataset_id(name)
                dataset = client.get_dataset(
                    dataset_id)  # Make an API request.
                print(dataset)
                dataset.description = "Updated description."
                dataset = client.update_dataset(
                    dataset, ["description"])  # Make an API request.

                full_dataset_id = "{}.{}".format(dataset.project,
                                                 dataset.dataset_id)
                value += "Updated dataset '{}' with description '{}'.\n".format(
                    full_dataset_id, dataset.description)
                #

                # update default table expiration
                dataset.default_table_expiration_ms = 24 * 60 * 60 * 1000  # 24 hours, in milliseconds
                dataset = client.update_dataset(
                    dataset, ["default_table_expiration_ms"])
                value += "Updated dataset '{}' with table expiration '{}'.\n".format(
                    full_dataset_id, dataset.default_table_expiration_ms)
                #

                # update default partition expiration
                dataset.default_partition_expiration_ms = 24 * 60 * 60 * 1000  # 24 hours, in milliseconds
                dataset = client.update_dataset(
                    dataset, ["default_partition_expiration_ms"])
                value += "Updated dataset '{}' with table partition expiration '{}'.\n".format(
                    full_dataset_id, dataset.default_partition_expiration_ms)
                #
                return value

            except IndexError:
                return "Provide the dataset name"
            except BaseException as e:
                print("\nlook here\n")
                print(e)
                return "An error occured"
        #

        # delete dataset
        elif (self.env.get("delete")):
            try:
                dataset_id = self.make_dataset_id(name)
                client.delete_dataset(dataset_id,
                                      delete_contents=True,
                                      not_found_ok=False)

                return "Deleted dataset '{}'.".format(dataset_id)
            except IndexError:
                return "Provide the dataset name"
            except BaseException as e:
                print("\nlook here\n")
                print(e.__class__.__name__)
                if (e.__class__.__name__ == "NotFound"):
                    return "Could not find dataset with name {} to delete".format(
                        dataset_id)
                return "An error occured"