def run_quickstart(override_values=None):
    """List every data source supported by the BigQuery Data Transfer Service.

    Args:
        override_values: Optional mapping used by the test harness to replace
            sample values; recognizes the key "project_id".
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_quickstart]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    # TODO: Update to your project ID.
    project_id = "my-project"
    # [END bigquerydatatransfer_quickstart]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)

    # [START bigquerydatatransfer_quickstart]
    # Get the full path to your project.
    parent = client.common_project_path(project_id)

    print("Supported Data Sources:")

    # Iterate over all possible data sources.
    for data_source in client.list_data_sources(parent=parent):
        print("{}:".format(data_source.display_name))
        print("\tID: {}".format(data_source.data_source_id))
        print("\tFull path: {}".format(data_source.name))
        print("\tDescription: {}".format(data_source.description))
def scheduled_query(transfer_client, dataset_id, table_id, query_string, schedule):
    """Create a scheduled-query transfer config with the provided client.

    Args:
        transfer_client: DataTransferServiceClient-like object; must expose a
            ``project`` attribute naming the project to run in — TODO confirm
            against the caller (a stock client has no such attribute).
        dataset_id: Destination dataset for the query results.
        table_id: Destination table name template.
        query_string: SQL to execute on each scheduled run.
        schedule: Schedule expression, e.g. "every 24 hours".
    """
    # BUG FIX: previously the injected ``transfer_client`` was immediately
    # overwritten with a brand-new client, discarding the caller's configured
    # client and breaking the ``transfer_client.project`` access below.

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = transfer_client.project
    parent = transfer_client.common_project_path(project_id)

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": table_id,
            "write_disposition": "WRITE_APPEND",
            "partitioning_field": "",
        },
        schedule=schedule,
    )

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
        )
    )

    print("Created scheduled query '{}'".format(transfer_config.name))
def update_config(override_values=None):
    """Update the display name of an existing transfer config.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            the keys "new_display_name" and "transfer_config_name".

    Returns:
        The updated TransferConfig (returned so tests can delete it).
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_update_config]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    new_display_name = "My Transfer Config"
    # [END bigquerydatatransfer_update_config]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    new_display_name = override_values.get("new_display_name", new_display_name)
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)

    # [START bigquerydatatransfer_update_config]
    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)
    transfer_config.display_name = new_display_name

    # Only the display_name field is written; everything else is untouched.
    transfer_config = transfer_client.update_transfer_config({
        "transfer_config": transfer_config,
        "update_mask": field_mask_pb2.FieldMask(paths=["display_name"]),
    })

    print(f"Updated config: '{transfer_config.name}'")
    print(f"New display name: '{transfer_config.display_name}'")
    # [END bigquerydatatransfer_update_config]

    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def run_notification(transfer_config_name, pubsub_topic):
    """Attach a Pub/Sub notification topic to an existing transfer config.

    Returns the updated TransferConfig so tests can clean it up afterwards.
    """
    # Remember the caller-supplied values; the doc snippet below shadows them
    # with placeholder sample values.
    saved_config_name = transfer_config_name
    saved_topic = pubsub_topic
    # [START bigquerydatatransfer_run_notification]
    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    pubsub_topic = "projects/PROJECT-ID/topics/TOPIC-ID"
    # [END bigquerydatatransfer_run_notification]
    transfer_config_name = saved_config_name
    pubsub_topic = saved_topic

    # [START bigquerydatatransfer_run_notification]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)
    transfer_config.notification_pubsub_topic = pubsub_topic
    update_mask = field_mask_pb2.FieldMask(paths=["notification_pubsub_topic"])

    transfer_config = transfer_client.update_transfer_config(
        {"transfer_config": transfer_config, "update_mask": update_mask}
    )

    print(f"Updated config: '{transfer_config.name}'")
    print(
        f"Notification Pub/Sub topic: '{transfer_config.notification_pubsub_topic}'"
    )
    # [END bigquerydatatransfer_run_notification]
    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def update_credentials_with_service_account(override_values=None):
    """Switch an existing transfer config to run as a service account.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            the keys "service_account_name" and "transfer_config_name".

    Returns:
        The updated TransferConfig (returned so tests can delete it).
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_update_credentials]
    from google.cloud import bigquery_datatransfer
    from google.protobuf import field_mask_pb2

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    service_account_name = "*****@*****.**"
    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_update_credentials]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    service_account_name = override_values.get("service_account_name",
                                               service_account_name)
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)

    # [START bigquerydatatransfer_update_credentials]
    transfer_config = bigquery_datatransfer.TransferConfig(
        name=transfer_config_name)

    # service_account_name rides alongside the update; the mask limits the
    # written fields to the credential switch.
    transfer_config = transfer_client.update_transfer_config({
        "transfer_config": transfer_config,
        "update_mask": field_mask_pb2.FieldMask(paths=["service_account_name"]),
        "service_account_name": service_account_name,
    })

    print("Updated config: '{}'".format(transfer_config.name))
    # [END bigquerydatatransfer_update_credentials]

    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def test_list_data_sources(project_id):
    # Smoke test: listing data sources for the project must not raise.
    client = bigquery_datatransfer.DataTransferServiceClient()
    parent = client.common_project_path(project_id)

    data_sources = list(client.list_data_sources(parent=parent))

    # NOTE(review): ``len(...) >= 0`` is vacuously true — the effective check
    # is that the API call above completed without an exception. Consider
    # asserting > 0 if at least one data source is guaranteed to exist.
    assert len(data_sources) >= 0
def ScheduledQueriesCheck(self, scheduled_queries_name, project_id='benq-data-etl'):
    '''
    Check the latest state of a BigQuery scheduled query and post the result
    to Slack via ``self.PostMessage``.

    param scheduled_queries_name : display name of the scheduled query
    param project_id : GCP project id (default: benq-data-etl)
    return dictionary :
        is_check_succeed => bool, True when the lookup itself succeeded
        is_query_execute_succeed => bool, True when the config's last state
        was SUCCEEDED
    '''
    starting_time = datetime.now()
    is_check_succeed = False
    is_query_execute_succeed = False
    try:
        transfer_client = bigquery_datatransfer.DataTransferServiceClient()
        parent = transfer_client.common_project_path(project_id)
        configs = transfer_client.list_transfer_configs(parent=parent)
        # Find the first transfer config whose display name matches;
        # None when no config matches.
        config = next(
            filter(lambda x: x.display_name == scheduled_queries_name,
                   configs), None)
        if config:
            is_query_execute_succeed = True if config.state.name == 'SUCCEEDED' else False
            is_check_succeed = True
        else:
            # error_log is only read below when is_check_succeed is False,
            # so assigning it solely on failure paths is safe.
            error_log = 'config of {} not found'.format(
                scheduled_queries_name)
    except Exception as E:
        error_log = str(E)
    ending_time = datetime.now()
    # Build a single human-readable status line including elapsed time.
    message = '{} to check state of scheduled queries: {}{} using time: {}'.format(
        'Succeeded' if is_check_succeed else 'Failed', scheduled_queries_name,
        '' if is_check_succeed else ' due to {}'.format(error_log),
        ending_time - starting_time)
    print(message + '\n\n')
    # Route to a "Correct" or "Error" log channel suffix based on outcome.
    self.PostMessage(
        self.slack_channel, message, '{}_Log_BigQuery'.format(
            'Correct' if is_check_succeed else 'Error'))
    return {
        'is_check_succeed': is_check_succeed,
        'is_query_execute_succeed': is_query_execute_succeed
    }
def transfer_client(default_credentials, project_id):
    """Fixture: build a DataTransferServiceClient bound to the quota project,
    then monkeypatch the client class so sample code reuses this instance.

    Args:
        default_credentials: (credentials, project) tuple from google.auth.
        project_id: Project to attribute quota usage to.
    """
    credentials, _ = default_credentials
    options = client_options.ClientOptions(quota_project_id=project_id)

    transfer_client = bigquery_datatransfer.DataTransferServiceClient(
        credentials=credentials, client_options=options)

    # Ensure quota is always attributed to the correct project.
    # BUG FIX: accept (and ignore) any constructor arguments — the previous
    # zero-argument lambda raised TypeError for sample code that passes
    # credentials or client_options to the constructor.
    bigquery_datatransfer.DataTransferServiceClient = (
        lambda *args, **kwargs: transfer_client)

    # NOTE(review): the class attribute is never restored afterwards — the
    # patch leaks beyond this fixture's lifetime. Consider a yield fixture
    # with teardown.
    return transfer_client
def load_transfers(self, cli: bigquery.Client, project: str, advertiser):
    """
    Bootstrap step to create BigQuery data transfers.

    :param cli: BigQuery client instance (currently unused in this method)
    :param project: Name of the project — NOTE(review): immediately
        overwritten from settings below, so the argument is effectively
        ignored; confirm this is intended.
    :param advertiser: Numeric value of an SA360 advertiser
    :return: The result of the client transfer configuration.
    """
    client = bigquery_datatransfer.DataTransferServiceClient()
    location = self.s.unwrap('location')
    # Overwrites the ``project`` parameter with the configured project name.
    project = self.s.unwrap('gcp_project_name')
    data_source = 'doubleclick_search'
    # NOTE(review): ``name`` is computed but never used below.
    name = client.location_data_source_path(project, location, data_source)
    parent = client.location_path(project, location)
    params = Struct()
    display_name= 'SA360 Transfer {}'.format(advertiser)
    # If a schedule with this display name already exists, wait on it
    # instead of creating a duplicate.
    config = Bootstrap.config_exists(client, parent, display_name)
    if config is not None:
        # NOTE(review): looks like a leftover debug print — consider removing.
        print('start transfer')
        cprint(
            'Schedule already exists for {}. Skipping'.format(advertiser),
            'cyan'
        )
        self.wait_for_transfer(client, config)
        return config
    # Transfer parameters expected by the doubleclick_search data source.
    params['agency_id'] = str(self.s.unwrap('agency_id'))
    params['advertiser_id'] = str(advertiser)
    params['include_removed_entities'] = False
    config = {
        'display_name': display_name,
        'destination_dataset_id': DataSets.raw.dataset_id,
        'data_source_id': SystemSettings.SERVICE_NAME,
        # Schedule the transfer daily at the current wall-clock time.
        'schedule': 'every day {}'.format(
            datetime.strftime(datetime.now(), '%H:%M')
        ),
        'params': params,
        'disabled': False,
    }
    result = client.create_transfer_config(parent, config)
    cprint(
        'Created schedule for {}'.format(advertiser),
        'cyan',
        attrs=['bold']
    )
    self.wait_for_transfer(client, result)
    return result
def start_export(project: str, transfer_config_name: str, transfer_location: str,
                 base_date: datetime.date, backfill_day_count: int):
    """Start and wait for completion of a backfill of ``backfill_day_count``
    days, counting backwards from ``base_date``.

    ``base_date`` is included in the backfill and counts as one of the days,
    i.e. ``backfill_day_count == 1`` backfills only ``base_date``.

    Raises:
        ValueError: If ``backfill_day_count`` is less than 1.
        DataTransferException: If any scheduled transfer run did not succeed.
    """
    if backfill_day_count <= 0:
        raise ValueError("Number of days to backfill must be at least 1")

    client = bigquery_datatransfer.DataTransferServiceClient()
    play_store_transfer_config = client.location_transfer_config_path(
        project, transfer_location, transfer_config_name
    )
    oldest_date = base_date - datetime.timedelta(days=backfill_day_count - 1)
    end_date = base_date
    logging.info(f"Backfilling {backfill_day_count} days: {oldest_date} to {base_date}")

    transfer_results = []
    # break backfills into BACKFILL_DAYS_MAX day segments
    while True:
        start_date = max(
            end_date - datetime.timedelta(days=BACKFILL_DAYS_MAX - 1), oldest_date)
        transfer_runs = trigger_backfill(
            start_date, end_date, play_store_transfer_config, client)
        # Oldest-first ordering by scheduled time for readable wait progress.
        transfer_run_names = [
            transfer_run.name
            for transfer_run in sorted(
                transfer_runs, key=lambda run: run.schedule_time.seconds)
        ]
        end_date = start_date - datetime.timedelta(days=1)

        # wait for backfill to complete
        # days in backfill are scheduled by the transfer service sequentially
        # with 30s in between, starting from the latest date, but can run in
        # parallel
        new_results = map(wait_for_transfer, transfer_run_names)
        transfer_results.extend(new_results)

        if start_date == oldest_date:
            break
        elif start_date < oldest_date:
            # Defensive check: start_date is clamped to oldest_date above, so
            # this branch should be unreachable.
            # BUG FIX: the message previously said "greater than" although the
            # condition triggers when start_date falls BELOW oldest_date.
            raise ValueError("start_date should not be less than oldest_date")

    successes = len([
        result for result in transfer_results
        if result == transfer_enums.TransferState.SUCCEEDED
    ])
    if len(transfer_results) != successes:
        raise DataTransferException(f"{len(transfer_results) - successes} failed dates")
def schedule_backfill(override_values=None):
    """Schedule backfill transfer runs for the trailing 5-to-2-day window.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            the key "transfer_config_name".

    Returns:
        The list of started transfer runs.
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_schedule_backfill]
    import datetime
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_schedule_backfill]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)

    # [START bigquerydatatransfer_schedule_backfill]
    now = datetime.datetime.now(datetime.timezone.utc)
    start_time = now - datetime.timedelta(days=5)
    end_time = now - datetime.timedelta(days=2)

    # Some data sources, such as scheduled_query only support daily run.
    # Truncate start_time and end_time to midnight time (00:00AM UTC).
    start_time = datetime.datetime(start_time.year, start_time.month,
                                   start_time.day,
                                   tzinfo=datetime.timezone.utc)
    end_time = datetime.datetime(end_time.year, end_time.month, end_time.day,
                                 tzinfo=datetime.timezone.utc)

    response = transfer_client.schedule_transfer_runs(
        parent=transfer_config_name,
        start_time=start_time,
        end_time=end_time,
    )

    print("Started transfer runs:")
    for run in response.runs:
        print(f"backfill: {run.run_time} run: {run.name}")
    # [END bigquerydatatransfer_schedule_backfill]

    return response.runs
def scheduled_query(self):
    """Create a scheduled query writing to a per-run-date destination table."""
    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = "your-project-id"
    dataset_id = "your_dataset_id"

    # This service account will be used to execute the scheduled queries. Omit
    # this request parameter to run the query as the user with the credentials
    # associated with this client.
    service_account_name = "*****@*****.**"

    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = transfer_client.common_project_path(project_id)

    config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    request = bigquery_datatransfer.CreateTransferConfigRequest(
        parent=parent,
        transfer_config=config,
        service_account_name=service_account_name,
    )
    created = transfer_client.create_transfer_config(request)

    print("Created scheduled query '{}'".format(created.name))
def list_configs(override_values=None):
    """Print the ID and schedule of every transfer config in a project.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            the key "project_id".
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_list_configs]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    project_id = "my-project"
    # [END bigquerydatatransfer_list_configs]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)

    # [START bigquerydatatransfer_list_configs]
    parent = transfer_client.common_project_path(project_id)

    configs = transfer_client.list_transfer_configs(parent=parent)

    print("Got the following configs:")
    for config in configs:
        print(f"\tID: {config.name}, Schedule: {config.schedule}")
def wait_for_transfer(transfer_name: str, timeout: int = 1200, polling_period: int = 20) -> int:
    """ Continuously poll for run status to wait for completion

    Polls the transfer run every ``polling_period`` seconds until the run
    leaves PENDING/RUNNING or ``timeout`` seconds of polling have elapsed.
    Returns the final TransferState value, or -1 on timeout.
    """
    client = bigquery_datatransfer.DataTransferServiceClient()
    state = transfer_enums.TransferState.PENDING
    time_elapsed = 0
    while (state == transfer_enums.TransferState.PENDING or
           state == transfer_enums.TransferState.RUNNING):
        try:
            transfer_run = client.get_transfer_run(transfer_name)
        except GoogleAPICallError as e:
            # grpc errors are not serializable and cannot be raised in multiprocessing
            raise DataTransferException(f"Error getting transfer run: {e.message}")
        run_date = datetime.datetime.utcfromtimestamp(transfer_run.run_time.seconds).date()
        state = transfer_run.state
        # Break immediately on a terminal state so we do not sleep one extra
        # polling period before reporting the result.
        if not (state == transfer_enums.TransferState.PENDING or
                state == transfer_enums.TransferState.RUNNING):
            break
        # Timeout is checked before sleeping, so the run gets at least one poll.
        if time_elapsed >= timeout:
            logging.info(f"Transfer for {run_date} did not complete in {timeout} seconds")
            return -1
        time.sleep(polling_period)
        time_elapsed += polling_period

    # Map the terminal state to a human-readable word for the log line.
    if state == transfer_enums.TransferState.SUCCEEDED:
        result = "succeeded"
    elif state == transfer_enums.TransferState.CANCELLED:
        result = "cancelled"
    elif state == transfer_enums.TransferState.FAILED:
        result = "failed"
    else:
        result = "unspecified"
    logging.info(f"Transfer for {run_date} {result}")
    return state
def run_quickstart():
    """Print every data source the transfer service supports (legacy API)."""
    # [START bigquerydatatransfer_quickstart]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    project = 'my-project'  # TODO: Update to your project ID.

    # Resolve the fully-qualified resource path for the project.
    parent = client.project_path(project)

    print('Supported Data Sources:')

    # Walk every data source available for transfers.
    for source in client.list_data_sources(parent):
        print('{}:'.format(source.display_name))
        print('\tID: {}'.format(source.data_source_id))
        print('\tFull path: {}'.format(source.name))
        print('\tDescription: {}'.format(source.description))
def delete_config(override_values=None):
    """Delete a transfer config, tolerating configs that no longer exist.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            the key "transfer_config_name".
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_delete_transfer]
    import google.api_core.exceptions
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    transfer_config_name = "projects/1234/locations/us/transferConfigs/abcd"
    # [END bigquerydatatransfer_delete_transfer]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    transfer_config_name = override_values.get("transfer_config_name",
                                               transfer_config_name)

    # [START bigquerydatatransfer_delete_transfer]
    try:
        transfer_client.delete_transfer_config(name=transfer_config_name)
    except google.api_core.exceptions.NotFound:
        # Deleting an already-deleted config is not an error for this sample.
        print("Transfer config not found.")
    else:
        print(f"Deleted transfer config: {transfer_config_name}")
def sample_update_transfer_config(config_name, display_name):
    """Fetch a transfer config and update its display name.

    Returns the updated config so tests can delete it afterwards.
    """
    # [START bigquerydatatransfer_update_config]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    # TODO(developer): Set the config_name which user wants to update.
    # config_name = "your-created-transfer-config-name"

    # TODO(developer): Set the display_name of transfer_config.
    # display_name = "your-new-display-name"
    # (The previous comment here repeated ``config_name`` by mistake.)

    transfer_config = client.get_transfer_config(name=config_name)
    transfer_config.display_name = display_name

    # Restrict the write to the display_name field only.
    field_mask = {"paths": ["display_name"]}
    response = client.update_transfer_config(transfer_config=transfer_config,
                                             update_mask=field_mask)

    print("Transfer config updated for '{}'".format(response.name))
    # [END bigquerydatatransfer_update_config]

    # Return the config name for testing purposes, so that it can be deleted.
    return response
def cancel_active_transfers(project: str, transfer_config_name: str,
                            transfer_location: str):
    """Delete every PENDING or RUNNING run of the given transfer config."""
    client = bigquery_datatransfer.DataTransferServiceClient()
    config_path = client.location_transfer_config_path(
        project, transfer_location, transfer_config_name)

    # Only runs that have not yet finished are candidates for cancellation.
    runs_iter = client.list_transfer_runs(
        config_path,
        states=[
            transfer_enums.TransferState.PENDING,
            transfer_enums.TransferState.RUNNING,
        ])

    deleted = 1
    for page in runs_iter.pages:
        for run in page:
            # pause to avoid hitting api rate limit
            if deleted % 20 == 0:
                time.sleep(3)
            run_date = datetime.datetime.utcfromtimestamp(
                run.run_time.seconds).date()
            client.delete_transfer_run(run.name)
            deleted += 1
            print(f"Cancelled {run.name} for date {run_date}")
def copy_dataset(override_values=None):
    """Create a cross-region dataset copy transfer config.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            "destination_project_id", "destination_dataset_id",
            "source_project_id" and "source_dataset_id".

    Returns:
        The created TransferConfig (returned so tests can delete it).
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_copy_dataset]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    destination_project_id = "my-destination-project"
    destination_dataset_id = "my_destination_dataset"
    source_project_id = "my-source-project"
    source_dataset_id = "my_source_dataset"
    # [END bigquerydatatransfer_copy_dataset]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    destination_project_id = override_values.get("destination_project_id",
                                                 destination_project_id)
    destination_dataset_id = override_values.get("destination_dataset_id",
                                                 destination_dataset_id)
    source_project_id = override_values.get("source_project_id",
                                            source_project_id)
    source_dataset_id = override_values.get("source_dataset_id",
                                            source_dataset_id)

    # [START bigquerydatatransfer_copy_dataset]
    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=destination_dataset_id,
        display_name="Your Dataset Copy Name",
        data_source_id="cross_region_copy",
        params={
            "source_project_id": source_project_id,
            "source_dataset_id": source_dataset_id,
        },
        schedule="every 24 hours",
    )
    transfer_config = transfer_client.create_transfer_config(
        parent=transfer_client.common_project_path(destination_project_id),
        transfer_config=transfer_config,
    )
    print(f"Created transfer config: {transfer_config.name}")
    # [END bigquerydatatransfer_copy_dataset]

    return transfer_config
def create_scheduled_query(override_values=None):
    """Create a scheduled query that runs under a service account.

    Args:
        override_values: Optional mapping used by the test harness; recognizes
            "project_id", "dataset_id" and "service_account_name".

    Returns:
        The created TransferConfig (returned so tests can delete it).
    """
    # Use None instead of a mutable {} default so the dict is never shared
    # between calls.
    if override_values is None:
        override_values = {}

    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]
    from google.cloud import bigquery_datatransfer

    transfer_client = bigquery_datatransfer.DataTransferServiceClient()

    # The project where the query job runs is the same as the project
    # containing the destination dataset.
    project_id = "your-project-id"
    dataset_id = "your_dataset_id"

    # This service account will be used to execute the scheduled queries. Omit
    # this request parameter to run the query as the user with the credentials
    # associated with this client.
    service_account_name = "*****@*****.**"
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]

    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    project_id = override_values.get("project_id", project_id)
    dataset_id = override_values.get("dataset_id", dataset_id)
    service_account_name = override_values.get("service_account_name",
                                               service_account_name)

    # [START bigquerydatatransfer_create_scheduled_query]
    # [START bigquerydatatransfer_create_scheduled_query_with_service_account]
    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = transfer_client.common_project_path(project_id)

    transfer_config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    transfer_config = transfer_client.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=parent,
            transfer_config=transfer_config,
            service_account_name=service_account_name,
        )
    )

    print("Created scheduled query '{}'".format(transfer_config.name))
    # [END bigquerydatatransfer_create_scheduled_query_with_service_account]
    # [END bigquerydatatransfer_create_scheduled_query]

    # Return the config name for testing purposes, so that it can be deleted.
    return transfer_config
def sample_create_transfer_config(project_id, dataset_id, authorization_code=""):
    """Create a scheduled query, optionally supplying an OAuth code.

    Returns the created config's resource name so tests can delete it.
    """
    # [START bigquerydatatransfer_create_scheduled_query]
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient()

    # TODO(developer): Set the project_id to the project that contains the
    # destination dataset.
    # project_id = "your-project-id"

    # TODO(developer): Set the destination dataset. The authorized user must
    # have owner permissions on the dataset.
    # dataset_id = "your_dataset_id"

    # TODO(developer): The first time you run this sample, set the
    # authorization code to a value from the URL:
    # https://www.gstatic.com/bigquerydatatransfer/oauthz/auth?client_id=433065040935-hav5fqnc9p9cht3rqneus9115ias2kn1.apps.googleusercontent.com&scope=https://www.googleapis.com/auth/bigquery%20https://www.googleapis.com/auth/drive&redirect_uri=urn:ietf:wg:oauth:2.0:oob
    #
    # authorization_code = "_4/ABCD-EFGHIJKLMNOP-QRSTUVWXYZ"
    #
    # You can use an empty string for authorization_code in subsequent runs of
    # this code sample with the same credentials.
    #
    # authorization_code = ""

    # Use standard SQL syntax for the query.
    query_string = """
    SELECT
      CURRENT_TIMESTAMP() as current_time,
      @run_time as intended_run_time,
      @run_date as intended_run_date,
      17 as some_integer
    """

    parent = f"projects/{project_id}"

    config = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=dataset_id,
        display_name="Your Scheduled Query Name",
        data_source_id="scheduled_query",
        params={
            "query": query_string,
            "destination_table_name_template": "your_table_{run_date}",
            "write_disposition": "WRITE_TRUNCATE",
            "partitioning_field": "",
        },
        schedule="every 24 hours",
    )

    response = client.create_transfer_config(
        request={
            "parent": parent,
            "transfer_config": config,
            "authorization_code": authorization_code,
        })

    print("Created scheduled query '{}'".format(response.name))
    # [END bigquerydatatransfer_create_scheduled_query]

    # Return the config name for testing purposes, so that it can be deleted.
    return response.name
# Modified version of the python example for "scheduled query" # https://cloud.google.com/bigquery/docs/scheduling-queries#setting_up_a_scheduled_query # # Current cli user must be authenticated and have the propper authrorization to BQDT from google.cloud import bigquery_datatransfer client = bigquery_datatransfer.DataTransferServiceClient() # TODO: Replace variables before attempting project = '' source_dataset_id = '' destination_dataset_id = '' display_name = 'Python Data Transfer final' parent = f"projects/{project}" transfer_config = bigquery_datatransfer.TransferConfig( destination_dataset_id=destination_dataset_id, display_name=display_name, data_source_id="cross_region_copy", params={ "source_dataset_id": source_dataset_id, "source_project_id": project, "overwrite_destination_table": "true" }, ) response = client.create_transfer_config( request={ "parent": parent,
from google.cloud import bigquery, bigquery_datatransfer
from google.oauth2 import service_account
import datetime
import functions as f
import os

# get credentials using the client_secrets.json which contains my service account details
credentials = service_account.Credentials.from_service_account_file(
    'client_secrets.json',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# BigQuery client bound to the service account's own project.
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)

# Data Transfer client reusing the same credentials.
client_transfer = bigquery_datatransfer.DataTransferServiceClient(
    credentials=credentials)

# sql query to create a view in the gold dataset
sql = """
SELECT warehouse_name, warehouse_id, country, product_id, inventory_quantity, reporting_month
FROM `zapp-case-study.silver_warehouse.warehouse_product_overview` wpo
"""

# This view will live in the gold Dataset
dataset = 'gold_warehouse'
def bqdts_client(credentials):
    """Build a BigQuery Data Transfer client using the given credentials."""
    from google.cloud import bigquery_datatransfer

    client = bigquery_datatransfer.DataTransferServiceClient(
        credentials=credentials)
    return client
def execute(self, data):
    """Run one BigQuery query/transfer operation selected by ``self.env``.

    Exactly one branch runs, chosen by whichever key of ``self.env`` is
    truthy: ``interactive``, ``batch``, ``dry_run``, ``permanent_table``,
    ``cache_enable/disable``, ``parameterized``, ``array_parameterized``,
    ``timestamp_parameterized``, ``struct_parameterized``,
    ``mutliple_by_wildcard``, ``scheduled``, ``scheduled_status``,
    ``manual_run``, ``scheduled_delete``.

    Args:
        data: request payload; keys read here are ``titleName``, ``emails``
            and ``query``. For the ``manual_run`` and ``scheduled_delete``
            branches ``query`` carries a transfer-config resource name,
            not SQL.

    Returns:
        A human-readable status string, or a JSON string (schema + rows)
        for branches that fetch query results.

    NOTE(review): ``json`` is used below but, unlike the other modules, is
    not bound from ``self`` — presumably imported at module level; confirm.
    """
    # setup: bind the injected clients/modules to locals
    client = self.client
    bigquery = self.bigquery
    bigquery_datatransfer = self.bigquery_datatransfer
    datetime = self.datetime
    pytz = self.pytz
    time = self.time
    name = data.get("titleName")
    emails = data.get("emails")
    query = data.get("query")
    table = ""
    #
    # create a dataset first if needed
    dataset_main = self.make_dataset()
    table_id = "{}.{}".format(dataset_main, name)
    #
    # create a table if needed
    table = self.make_table(table_id)
    #
    # an interactive query
    if(self.env.get("interactive")):
        try:
            # Example of the SQL expected in `query`:
            """ query = SELECT name, SUM(number) as total_people FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE state = 'TX' GROUP BY name, state ORDER BY total_people DESC LIMIT 20 """
            query_job = client.query(query)  # Make an API request.
            return json.dumps({
                "schema": [{"field": x} for x in ["Name", "Count"]],
                "data": [
                    # Row values can be accessed by field name or index.
                    {"Name": row[0], "Count": row["total_people"]}
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # a batch query
    elif(self.env.get("batch")):
        try:
            job_config = bigquery.QueryJobConfig(
                # Run at batch priority, which won't count toward concurrent rate limit.
                priority=bigquery.QueryPriority.BATCH
            )
            """ query = SELECT corpus FROM `bigquery-public-data.samples.shakespeare` GROUP BY corpus; """
            # Start the query, passing in the extra configuration.
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            # Re-fetch the job to get its current server-side state.
            query_job = client.get_job(
                query_job.job_id, location=query_job.location
            )  # Make an API request.
            return "Job {} is currently in state {}".format(query_job.job_id, query_job.state)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # a dry run
    elif (self.env.get("dry_run")):
        try:
            """ query SELECT name, COUNT(*) as name_count FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE state = 'WA' GROUP BY name """
            job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
            # Start the query, passing in the extra configuration.
            query_job = client.query(
                query,
                job_config=job_config,
            )  # Make an API request.
            # A dry run query completes immediately.
            return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # write to a permanent table
    elif(self.env.get("permanent_table")):
        try:
            """ query SELECT corpus FROM `bigquery-public-data.samples.shakespeare` GROUP BY corpus; """
            # Results are written to table_id built during setup above.
            job_config = bigquery.QueryJobConfig(destination=table_id)
            # Start the query, passing in the extra configuration.
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            # for queries that return massive results using legacy SQL
            # job_config = bigquery.QueryJobConfig(
            #     allow_large_results=True, destination=table_id, use_legacy_sql=True
            # )
            query_job.result()  # Wait for the job to complete.
            return "Query results loaded to the table {}".format(table_id)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # query caching
    elif (self.env.get("cache_enable/disable")):
        try:
            """ query SELECT corpus FROM `bigquery-public-data.samples.shakespeare` GROUP BY corpus; """
            job_config = bigquery.QueryJobConfig(
                use_query_cache=False  # to disable
                # createDisposition="CREATE_NEVER" # to enable
            )
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            query_job.result()
            return "was this query cached? {}".format(query_job.cache_hit)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # regular parameterized
    elif(self.env.get("parameterized")):
        try:
            """ query = SELECT word, word_count FROM `bigquery-public-data.samples.shakespeare` WHERE corpus = @corpus AND word_count >= @min_word_count ORDER BY word_count DESC; """
            job_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ScalarQueryParameter("corpus", "STRING", "romeoandjuliet"),
                    bigquery.ScalarQueryParameter("min_word_count", "INT64", 250),
                ]
            )
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            return json.dumps({
                "schema": [{"field": x} for x in ["word", "word_count"]],
                "data": [
                    # Row values can be accessed by field name or index.
                    {"word": row["word"], "word_count": row["word_count"]}
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # array parameterized
    elif(self.env.get("array_parameterized")):
        try:
            schema = ["name", "count"]
            """query = SELECT name, sum(number) as count FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE gender = @gender AND state IN UNNEST(@states) GROUP BY name ORDER BY count DESC LIMIT 10; """
            job_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ScalarQueryParameter("gender", "STRING", "M"),
                    bigquery.ArrayQueryParameter("states", "STRING", ["WA", "WI", "WV", "WY"]),
                ]
            )
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            return json.dumps({
                "schema": [{"field": x} for x in schema],
                "data": [
                    # Row values can be accessed by field name or index.
                    {schema[0]: row[schema[0]], schema[1]: row[schema[1]]}
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # timestamp parameters
    elif(self.env.get("timestamp_parameterized")):
        try:
            schema = ["date", "f0_"]
            """query = SELECT TIMESTAMP_ADD(@ts_value, INTERVAL 1 HOUR); """
            job_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ScalarQueryParameter(
                        "ts_value",
                        "TIMESTAMP",
                        datetime.datetime(2016, 12, 7, 8, 0, tzinfo=pytz.UTC),
                    )
                ]
            )
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            # default=str lets json.dumps serialize any remaining datetimes.
            return json.dumps(
                {
                    "schema": [{"field": x} for x in schema],
                    "data": [
                        # Row values can be accessed by field name or index.
                        {"date": row[0].strftime("%m/%d/%Y, %H:%M:%S"), schema[1]: row[schema[1]]}
                        for row in query_job
                    ]
                },
                default=str
            )
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # struct parameters
    elif(self.env.get("struct_parameterized")):
        try:
            schema = ["s"]
            """ query = SELECT @struct_value AS s; """
            job_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.StructQueryParameter(
                        "struct_value",
                        bigquery.ScalarQueryParameter("x", "INT64", 1),
                        bigquery.ScalarQueryParameter("y", "STRING", "foo"),
                    )
                ]
            )
            query_job = client.query(query, job_config=job_config)  # Make an API request.
            return json.dumps({
                "schema": [{"field": x} for x in schema],
                "data": [
                    # Row values can be accessed by field name or index.
                    {schema[0]: str(row[schema[0]]), }
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # wildcard
    elif (self.env.get("mutliple_by_wildcard")):
        try:
            job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
            # replace the asterisk with a number or change the table suffixes to see the amount of data used
            """ query SELECT max, ROUND((max-32)*5/9,1) celsius, mo, da, year FROM `bigquery-public-data.noaa_gsod.gsod194*` WHERE max != 9999.9 # code for missing data AND ( _TABLE_SUFFIX = '0' OR _TABLE_SUFFIX = '4' ) ORDER BY max DESC """
            # Start the query, passing in the extra configuration.
            query_job = client.query(
                query,
                job_config=job_config,
            )  # Make an API request.
            # A dry run query completes immediately.
            return "This query will process {} megabytes.".format(query_job.total_bytes_processed/1000000)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # scheduled query
    # using bigquery datatransfer, the names are not joined as much
    elif(self.env.get("scheduled")):
        try:
            # Use standard SQL syntax for the query.
            """ query = SELECT CURRENT_TIMESTAMP() as current_time, @run_time as intended_run_time, @run_date as intended_run_date, 17 as some_integer """
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()
            project_id = client.project
            dataset_id = self.dataset_names[0]
            # This service account will be used to execute the scheduled queries. Omit
            # this request parameter to run the query as the user with the credentials
            # associated with this client. remember roles and permissions
            # service_account_name = "*****@*****.**"
            parent = transfer_client.common_project_path(project_id)
            transfer_config = bigquery_datatransfer.TransferConfig(
                destination_dataset_id=dataset_id,
                display_name="Your Scheduled Query Name",
                data_source_id="scheduled_query",
                params={
                    "query": query,
                    "destination_table_name_template": "your_table_{run_date}",
                    "write_disposition": "WRITE_APPEND",
                    "partitioning_field": "",
                },
                schedule="every 24 hours",
            )
            transfer_config = transfer_client.create_transfer_config(
                bigquery_datatransfer.CreateTransferConfigRequest(
                    parent=parent,
                    transfer_config=transfer_config,
                    # service_account_name=service_account_name,
                )
            )
            return "Created scheduled query '{}'".format(transfer_config.name)
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # check schedule status
    # you see them in scheduled queries
    elif(self.env.get("scheduled_status")):
        try:
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()
            project_id = client.project
            parent = transfer_client.common_project_path(project_id)
            configs = transfer_client.list_transfer_configs(parent=parent)
            result = "Got the following configs:"
            for config in configs:
                result += f"\n\tID: {config.name}, Schedule: {config.schedule}"
            return result
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # manual runs
    # used to immediately execute a query
    elif (self.env.get("manual_run")):
        try:
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()
            # NOTE(review): here `query` carries the transfer-config resource
            # name, not SQL — confirm against the caller.
            transfer_config_name = query
            now = datetime.datetime.now(datetime.timezone.utc)
            start_time = now - datetime.timedelta(days=5)
            end_time = now - datetime.timedelta(days=2)
            # Some data sources, such as scheduled_query only support daily run.
            # Truncate start_time and end_time to midnight time (00:00AM UTC).
            start_time = datetime.datetime(
                start_time.year, start_time.month, start_time.day, tzinfo=datetime.timezone.utc
            )
            end_time = datetime.datetime(
                end_time.year, end_time.month, end_time.day, tzinfo=datetime.timezone.utc
            )
            response = transfer_client.schedule_transfer_runs(
                parent=transfer_config_name,
                start_time=start_time,
                end_time=end_time,
            )
            result = "Started transfer runs:"
            for run in response.runs:
                result += f"\n\tbackfill: {run.run_time} run: {run.name}"
            return result
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # delete scheduled query
    elif(self.env.get("scheduled_delete")):
        try:
            import google.api_core.exceptions
            transfer_client = bigquery_datatransfer.DataTransferServiceClient()
            # `query` again carries the transfer-config resource name here.
            transfer_config_name = query
            transfer_client.delete_transfer_config(name=transfer_config_name)
            return f"Deleted transfer config: {transfer_config_name}"
        except google.api_core.exceptions.NotFound:
            return "Transfer config not found."
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occured check the output from the backend'
    #
    # fall-through: no recognized env flag was set
    return "Check the backend env dictionary you did set it so the backend didnt do anything"
def execute(self, name):
    """Run one dataset-level BigQuery operation selected by ``self.env``.

    Branches: ``create``, ``copy``, ``access_control``, ``list``,
    ``metadata``, ``update``, ``delete`` — exactly one runs per call.

    Args:
        name: dataset name supplied by the caller (used as the
            destination dataset in the ``copy`` branch).

    Returns:
        A human-readable status string.

    NOTE(review): ``client``, ``bigquery`` and ``bigquery_datatransfer``
    are not bound from ``self`` here — presumably module-level globals;
    confirm against the rest of the file.
    """
    # create a dataset
    if (self.env.get("create")):
        dataset_id = self.make_dataset_id(name)
        dataset_init = bigquery.Dataset(dataset_id)
        dataset_init.location = "US"  # multi-region
        try:
            if (name == ""):
                raise IndexError
            dataset = client.create_dataset(dataset_init, timeout=30)
            return "Created dataset {}.{}".format(client.project, dataset.dataset_id)
        except IndexError:
            return "Please input a name"
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "Dataset already exists choose another name"
    #
    # copy a dataset
    elif (self.env.get("copy")):
        transfer_client = bigquery_datatransfer.DataTransferServiceClient()
        # create a source dataset
        source_project_id = client.project
        source_dataset_id = "my_source_dataset"
        dataset_id = self.make_dataset_id(source_dataset_id)
        dataset_init = bigquery.Dataset(dataset_id)
        try:
            dataset = client.create_dataset(dataset_init, timeout=30)
        except BaseException as e:
            print("")  # no-op: source dataset probably exists already
        #
        try:
            destination_project_id = client.project
            destination_dataset_id = name
            if (name == ""):
                raise IndexError
            transfer_config = bigquery_datatransfer.TransferConfig(
                destination_dataset_id=destination_dataset_id,
                display_name="Your Dataset Copy Name",
                data_source_id="cross_region_copy",
                params={
                    "source_project_id": source_project_id,
                    "source_dataset_id": source_dataset_id,
                },
                # schedule="from ",
            )
            transfer_config = transfer_client.create_transfer_config(
                parent=transfer_client.common_project_path(
                    destination_project_id),
                transfer_config=transfer_config,
            )
            return "Created transfer config: {}".format(
                transfer_config.name)
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "Was not able to make a copy mabye check the terminal where the python server is running"
    #
    # access control
    elif (self.env.get("access_control")):
        try:
            dataset_id = self.make_dataset_id(name)
            dataset = client.get_dataset(dataset_id)
            entry = bigquery.AccessEntry(
                role="READER",
                entity_type="userByEmail",
                entity_id="*****@*****.**",
            )
            entries = list(dataset.access_entries)
            entries.append(entry)
            dataset.access_entries = entries
            dataset = client.update_dataset(dataset, ["access_entries"])
            full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
            return "Updated dataset '{}' with modified user permissions.".format(
                full_dataset_id)
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "An error occured"
    #
    # list all datasets
    elif (self.env.get("list")):
        try:
            datasets = list(client.list_datasets())
            project = client.project
            value = "datasets in project {}".format(project)
            if datasets:
                for dataset in datasets:
                    value += "\n{}".format(dataset.dataset_id)
                return value
            else:
                return "{} project does not contain any datasets.".format(
                    project)
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "An error occured"
    #
    # metadata
    elif (self.env.get("metadata")):
        value = "Metadata:\n"
        try:
            # get the dataset friendly name
            dataset_id = self.make_dataset_id(name)
            dataset = client.get_dataset(dataset_id)
            full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
            value += "{}: {}\n".format("Friendly Name", dataset.friendly_name)
            #
            # more properties
            value += "{}: {}\n".format("Description", dataset.description)
            value += "Labels:\n"
            labels = dataset.labels
            if labels:
                # NOTE(review): the loop variable `value` shadows the report
                # accumulator built above, clobbering it — looks like a bug;
                # confirm intent before renaming.
                for label, value in labels.items():
                    value += "\n{}: {}".format(label, value)
            else:
                value += "\tDataset has no labels defined.\n"
            #
            # View tables
            value += "Tables:"
            tables = list(client.list_tables(dataset))
            if tables:
                for table in tables:
                    value += "\n{}:".format(table.table_id)
            else:
                value += "\tDataset does not have tables.\n"
            #
            return value
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "An error occured"
    #
    # update dataset
    elif (self.env.get("update")):
        value = ""
        try:
            # update dataset description
            dataset_id = self.make_dataset_id(name)
            dataset = client.get_dataset(
                dataset_id)  # Make an API request.
            print(dataset)
            dataset.description = "Updated description."
            dataset = client.update_dataset(
                dataset, ["description"])  # Make an API request.
            full_dataset_id = "{}.{}".format(dataset.project, dataset.dataset_id)
            value += "Updated dataset '{}' with description '{}'.\n".format(
                full_dataset_id, dataset.description)
            #
            # update default table expiration
            dataset.default_table_expiration_ms = 24 * 60 * 60 * 1000  # In milliseconds means in 24 hours
            dataset = client.update_dataset(
                dataset, ["default_table_expiration_ms"])
            value += "Updated dataset '{}' with table expiration '{}'.\n".format(
                full_dataset_id, dataset.default_table_expiration_ms)
            #
            # update default partition expiration
            dataset.default_partition_expiration_ms = 24 * 60 * 60 * 1000  # In milliseconds means in 24 hours
            dataset = client.update_dataset(
                dataset, ["default_partition_expiration_ms"])
            value += "Updated dataset '{}' with table partition expiration '{}'.\n".format(
                full_dataset_id, dataset.default_partition_expiration_ms)
            #
            return value
        except IndexError:
            return "Provide the dataset name"
        except BaseException as e:
            print("\nlook here\n")
            print(e)
            return "An error occured"
    #
    # delete dataset
    elif (self.env.get("delete")):
        try:
            dataset_id = self.make_dataset_id(name)
            client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=False)
            return "Deleted dataset '{}'.".format(dataset_id)
        except IndexError:
            return "Provide the dataset name"
        except BaseException as e:
            print("\nlook here\n")
            print(e.__class__.__name__)
            if (e.__class__.__name__ == "NotFound"):
                return "Could not find dataset with name {} to delete".format(
                    dataset_id)
            return "An error occured"