def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: List of BQ job_ids
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
def schema_upgrade_cdm52_to_cdm531(project_id,
                                   dataset_id,
                                   snapshot_dataset_id,
                                   hpo_id=None):
    """
    :param project_id: Identifies the BQ project
    :param dataset_id: Dataset to convert
    :param snapshot_dataset_id: Dataset with converted tables. Overwritten if tables already exist
    :param hpo_id: Identifies the hpo_id of the site
    :return: list of completed copy job_ids
    """
    # Create the snapshot dataset if it does not exist
    client = bq.get_client(project_id)
    client.create_dataset(snapshot_dataset_id, exists_ok=True)
    sq.create_empty_cdm_tables(snapshot_dataset_id, hpo_id)

    copy_table_job_ids = []
    tables = [table.table_id for table in list(client.list_tables(dataset_id))]
    if hpo_id:
        hpo_tables = [
            resources.get_table_id(table, hpo_id)
            for table in resources.CDM_TABLES + PII_TABLES
        ]
        # Keep only the HPO tables that actually exist in the source dataset
        tables = [table for table in hpo_tables if table in tables]

    for table_id in tables:
        q = get_upgrade_table_query(client, dataset_id, table_id, hpo_id)
        job_config = QueryJobConfig()
        job_config.destination = f'{client.project}.{snapshot_dataset_id}.{table_id}'
        job_config.use_legacy_sql = False
        job = client.query(q, job_config)
        copy_table_job_ids.append(job.job_id)
        job.result()
    return copy_table_job_ids
def test_fill_from_default_conflict(self):
    from google.cloud.bigquery import QueryJobConfig

    basic_job_config = QueryJobConfig()
    conflicting_job_config = self._make_one("conflicting_job_type")
    self.assertNotEqual(basic_job_config._job_type,
                        conflicting_job_config._job_type)

    with self.assertRaises(TypeError):
        basic_job_config._fill_from_default(conflicting_job_config)
def test_create_udf(self, udf_path):
    client = self._client
    bq_test_dataset = utils.get_target_bq_dataset(udf_path)
    job_config = QueryJobConfig()
    job_config.default_dataset = f'{client.project}.{bq_test_dataset}'
    try:
        udf_sql = utils.replace_with_test_datasets(udf_path, client.project)
        udf_creation_result = client.query(udf_sql,
                                           job_config=job_config).result()
        self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
    except GoogleAPICallError as e:
        self.fail(e.message)
def bq_create_table_as_select(google_client, dataset_id, table_name, query):
    table_ref = google_client.dataset(dataset_id).table(table_name)
    job_config = QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = "WRITE_TRUNCATE"
    query_job = google_client.query(query=query, job_config=job_config)

    retry_count = 100
    while retry_count > 0 and query_job.state != 'DONE':
        retry_count -= 1
        time.sleep(3)
        query_job.reload()  # API call
        logging.info("job state : %s" % query_job.state)
    logging.info("job state : %s at %s" % (query_job.state, query_job.ended))
def test_create_udf(self, udf_path):
    client = bigquery.Client()
    bq_test_dataset = Utils.get_target_bq_dataset(udf_path)
    client.create_dataset(bq_test_dataset, exists_ok=True)
    job_config = QueryJobConfig()
    job_config.default_dataset = f'{client.project}.{bq_test_dataset}'
    with open(udf_path) as udf_file:
        try:
            udf_sql = Utils.replace_with_test_datasets(udf_path, client.project)
            udf_creation_result = client.query(udf_sql,
                                               job_config=job_config).result()
            self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
        except GoogleAPICallError as e:
            self.fail(e.message)
def test_create_udf_signature(self, udf_path):
    client = self._client
    bq_test_dataset = utils.get_target_bq_dataset(udf_path)
    job_config = QueryJobConfig()
    job_config.default_dataset = f'{client.project}.{bq_test_dataset}'
    udf_signature = utils.extract_udf_signature(udf_path)
    udf_sql = utils.replace_with_test_datasets(
        project_id=client.project,
        udf_sql=f'CREATE OR REPLACE FUNCTION {udf_signature} AS (NULL)')
    try:
        udf_creation_result = client.query(udf_sql,
                                           job_config=job_config).result()
        self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
    except GoogleAPICallError as e:
        self.fail(e.message)
def __init__(self, dataset: str):
    qjc: QueryJobConfig = QueryJobConfig(
        default_dataset=f"{os.environ.get('PROJECT')}.{dataset}")
    self.client = bigquery.Client(default_query_job_config=qjc)
    self.dataset = dataset
    self.dcl = "datasetchangelog"
    self.destination = f"{self.client.project}.{self.dataset}.{self.dcl}"
    self.logger = logging.getLogger(__name__)
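A minimal usage sketch for a wrapper like the one above. The class name `ChangelogRepository` and the query are assumptions, not from the original; the point is that a client built with `default_query_job_config` resolves unqualified table names against the configured default dataset.

# Hypothetical usage sketch: `ChangelogRepository` stands in for whatever class
# defines the __init__ above; the query text is an assumption.
repo = ChangelogRepository(dataset="analytics")
# Unqualified "datasetchangelog" resolves via the client's default_dataset.
rows = repo.client.query("SELECT COUNT(*) AS n FROM datasetchangelog").result()
for row in rows:
    repo.logger.info("changelog rows: %d", row.n)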
def test_fill_from_default(self):
    from google.cloud.bigquery import QueryJobConfig

    job_config = QueryJobConfig()
    job_config.dry_run = True
    job_config.maximum_bytes_billed = 1000

    default_job_config = QueryJobConfig()
    default_job_config.use_query_cache = True
    default_job_config.maximum_bytes_billed = 2000

    final_job_config = job_config._fill_from_default(default_job_config)
    self.assertTrue(final_job_config.dry_run)
    self.assertTrue(final_job_config.use_query_cache)
    self.assertEqual(final_job_config.maximum_bytes_billed, 1000)
def load(project_id,
         bq_client,
         src_dataset_id,
         dst_dataset_id,
         overwrite_ok=False):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :param overwrite_ok: if True and the dest dataset already exists, the dataset is recreated
    :return: List of BQ query jobs
    """
    if overwrite_ok:
        bq_client.delete_dataset(dst_dataset_id,
                                 delete_contents=True,
                                 not_found_ok=True)
    bq_client.create_dataset(dst_dataset_id)

    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))
    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
    return query_jobs
def process(self, app_numbers):
    sql = """
        SELECT application_number, application_kind, grant_date
        FROM `patents-public-data.patents.publications`
        WHERE country_code = @us_country_code
            AND application_number IN UNNEST(@application_numbers)
            AND IF (
                publication_date >= @wipo_kind_codes_from,
                kind_code IN UNNEST(@wipo_patent_publication_codes),
                kind_code = @uspto_patent_publication_code
            );
    """
    job_config = QueryJobConfig(query_parameters=[
        ScalarQueryParameter(
            'us_country_code',
            'STRING',
            US_COUNTRY_CODE,
        ),
        ArrayQueryParameter(
            'application_numbers',
            'STRING',
            app_numbers,
        ),
        ScalarQueryParameter(
            'wipo_kind_codes_from',
            'INT64',
            WIPO_KIND_CODES_FROM,
        ),
        ArrayQueryParameter(
            'wipo_patent_publication_codes',
            'STRING',
            WIPO_PATENT_PUBLICATION_CODES,
        ),
        ScalarQueryParameter(
            'uspto_patent_publication_code',
            'STRING',
            USPTO_PATENT_PUBLICATION_CODE,
        ),
    ])
    query = self.storage_client.query(sql, job_config=job_config)
    logging.info('Executing query for publications')
    iterator = query.result()
    return iterator
def run_query(self, query, job_id_prefix=None, wait=False, use_query_cache=True):
    job_config = QueryJobConfig(use_query_cache=use_query_cache)
    query_job = self.bq_client.query(query,
                                     job_id_prefix=job_id_prefix,
                                     job_config=job_config)
    print(f'Job {query_job.job_id} started.')
    query_job.add_done_callback(self.done_cb)
    if wait:
        query_job.result()
    return query_job
def _set_maximum_bytes_billed(
    self,
    job_config: bigquery.QueryJobConfig,
    maximum_gbytes_billed: Optional[float]
) -> bigquery.QueryJobConfig:
    """Set a limit on the amount of data the query may process.

    The limit is given in GB and converted to bytes before being stored
    on the job config.

    Arguments:
        job_config {bigquery.QueryJobConfig} -- job configuration
        maximum_gbytes_billed {Optional[float]} -- upper limit in GB

    Returns:
        bigquery.QueryJobConfig -- job configuration with the limit applied
    """
    maximum_gbytes_billed = (
        maximum_gbytes_billed
        if self.maximum_gbytes_billed is None
        else self.maximum_gbytes_billed
    )
    if maximum_gbytes_billed is not None:
        maximum_bytes_billed = int(maximum_gbytes_billed * 1024 ** 3)
        job_config.maximum_bytes_billed = maximum_bytes_billed
    return job_config
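A standalone sketch of the same GB-to-bytes pattern, using only the public `QueryJobConfig.maximum_bytes_billed` property; the 0.5 GB limit and the trivial query are placeholders, not values from the original.

from google.cloud import bigquery

# Assumed example: cap a query at 0.5 GB processed, converted to bytes.
client = bigquery.Client()
job_config = bigquery.QueryJobConfig(
    maximum_bytes_billed=int(0.5 * 1024 ** 3))
# A query that would process more than the limit fails instead of being billed.
job = client.query("SELECT 1 AS x", job_config=job_config)
print(list(job.result()))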
def test_serialization():
    bigquery_task = BigQueryTask(
        name="flytekit.demo.bigquery_task.query",
        inputs=kwtypes(ds=str),
        task_config=BigQueryConfig(
            ProjectID="Flyte",
            Location="Asia",
            QueryJobConfig=QueryJobConfig(allow_large_results=True),
        ),
        query_template=query_template,
        output_structured_dataset_type=StructuredDataset,
    )

    @workflow
    def my_wf(ds: str) -> StructuredDataset:
        return bigquery_task(ds=ds)

    default_img = Image(name="default", fqn="test", tag="tag")
    serialization_settings = SerializationSettings(
        project="proj",
        domain="dom",
        version="123",
        image_config=ImageConfig(default_image=default_img, images=[default_img]),
        env={},
    )
    task_spec = get_serializable(OrderedDict(), serialization_settings, bigquery_task)
    assert "SELECT * FROM `bigquery-public-data.crypto_dogecoin.transactions`" in task_spec.template.sql.statement
    assert "@version" in task_spec.template.sql.statement
    assert task_spec.template.sql.dialect == task_spec.template.sql.Dialect.ANSI
    s = Struct()
    s.update({"ProjectID": "Flyte", "Location": "Asia", "allowLargeResults": True})
    assert task_spec.template.custom == json_format.MessageToDict(s)
    assert len(task_spec.template.interface.inputs) == 1
    assert len(task_spec.template.interface.outputs) == 1

    admin_workflow_spec = get_serializable(OrderedDict(), serialization_settings, my_wf)
    assert admin_workflow_spec.template.interface.outputs["o0"].type.structured_dataset_type is not None
    assert admin_workflow_spec.template.outputs[0].var == "o0"
    assert admin_workflow_spec.template.outputs[0].binding.promise.node_id == "n0"
    assert admin_workflow_spec.template.outputs[0].binding.promise.var == "results"
def run_sql(self, query: str) -> BigQueryRows:
    bigquery = self._bigquery(self.credentials.project_id)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('Query (%r characters total): %r',
                  len(query), self._trunc_query(query))
    if config.bigquery_batch_mode:
        job_config = QueryJobConfig(priority=QueryPriority.BATCH)
        job: QueryJob = bigquery.query(query, job_config=job_config)
        result = job.result()
    else:
        delays = (10, 20, 40, 80)
        assert sum(delays) < config.contribution_lambda_timeout(retry=False)
        for attempt, delay in enumerate((*delays, None)):
            job: QueryJob = bigquery.query(query)
            try:
                result = job.result()
            except (Forbidden, InternalServerError, ServiceUnavailable) as e:
                if delay is None:
                    raise e
                elif isinstance(
                        e, Forbidden
                ) and 'Exceeded rate limits' not in e.message:
                    raise e
                else:
                    log.warning(
                        'BigQuery job error during attempt %i/%i. Retrying in %is.',
                        attempt + 1, len(delays) + 1, delay,
                        exc_info=e)
                    sleep(delay)
            else:
                break
        else:
            assert False
    if log.isEnabledFor(logging.DEBUG):
        log.debug('Job info: %s', json.dumps(self._job_info(job)))
    return result
def __create_job_config(
        self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels
    if ems_query_job_config.destination_table is not None:
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value, partitioning.field,
            partitioning.expiration_ms, partitioning.require_partition_filter)
    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions
    return job_config
def _config_query(self, use_legacy_sql):
    job_config = QueryJobConfig()
    job_config.destination = self.temp_table
    job_config.use_legacy_sql = use_legacy_sql
    job_config.allow_large_results = True
    return job_config
def load_from_github(repository_name):
    """
    Load repository information from Github such as pull requests and contributors

    :param repository_name: full name of the repository to load
    :return: the saved Repository instance, or None if the repository is unknown
    """
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") + "Loading from github")
    repo_api = get_repo_api(repository_name)
    try:
        full_name = repo_api.full_name
    except UnknownObjectException:
        return None
    repo = Repository(id=repo_api.id, full_name=full_name, name=repo_api.name)
    repo.language, created = Language.objects.get_or_create(name=repo_api.language)
    repo.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") + "Getting contributors")
    contributor_counter = len(list(repo_api.get_contributors()))
    repo.contributors_count = contributor_counter
    if config.GET_CONTRIBUTORS_DATA:
        for contrib in repo_api.get_contributors():
            contributor_counter += 1
            try:
                contributor_db = Contributor.objects.get(login__exact=contrib.login)
            except ObjectDoesNotExist:
                contributor_db = Contributor()
                contributor_db.login = contrib.login
                contributor_db.followers_count = contrib.followers
                contributor_db.url = contrib.html_url
                contributor_db.save()
            contribution_db = Contribution(repository=repo,
                                           contributor=contributor_db,
                                           amount=contrib.contributions)
            contribution_db.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") + "Getting pull request Data")
    if config.USE_BIGQUERY:
        bigquery_client: bigquery.Client = bigquery.Client.from_service_account_json(
            "socialpatterns-c03d755a739c.json")
        repo_url_bigquery = repo_api.html_url.replace("github.com",
                                                      "api.github.com/repos")
        query_config = QueryJobConfig()
        query_config.use_legacy_sql = False
        query_text = """
            SELECT Count(*) AS Pull_Request,
                   (SELECT Count(*)
                    FROM `ghtorrent-bq.ght_2018_04_01.issue_comments`
                    WHERE issue_id IN
                          (SELECT id
                           FROM `ghtorrent-bq.ght_2018_04_01.issues`
                           WHERE pull_request_id IN
                                 (SELECT id
                                  FROM `ghtorrent-bq.ght_2018_04_01.pull_requests`
                                  WHERE base_repo_id =
                                        (SELECT id
                                         FROM `ghtorrent-bq.ght_2018_04_01.projects` AS pj
                                         WHERE pj.url = "%s"
                                         LIMIT 1)))) AS Comments
            FROM `ghtorrent-bq.ght_2018_04_01.pull_requests`
            WHERE base_repo_id = (SELECT id
                                  FROM `ghtorrent-bq.ght_2018_04_01.projects` AS pj
                                  WHERE pj.url = "%s"
                                  LIMIT 1)
        """ % (repo_url_bigquery, repo_url_bigquery)
        query_job = bigquery_client.query(query_text, job_config=query_config)
        pr_number = list(query_job.result())[0][0]
        comments = list(query_job.result())[0][1]
    else:
        if config.CHECK_CLOSED_PR:
            pull_requests = repo_api.get_pulls(state="all")
        else:
            pull_requests = repo_api.get_pulls()
        pr_number = len(list(pull_requests))
        comments = 0
        for pr in pull_requests:
            try:
                comments += pr.comments
            except ssl.SSLError:
                logger.error("Read timeout when getting comments")

    repo.comments_count = comments
    repo.pull_request_count = pr_number
    repo.save()
    return repo
def run_query(self, query, **kwargs):
    from google.auth.exceptions import RefreshError
    from google.cloud.bigquery import QueryJobConfig
    from concurrent.futures import TimeoutError

    job_config = {
        'query': {
            'useLegacySql': self.dialect == 'legacy'
            # 'allowLargeResults', 'createDisposition',
            # 'preserveNulls', destinationTable, useQueryCache
        }
    }
    config = kwargs.get('configuration')
    if config is not None:
        if len(config) != 1:
            raise ValueError("Only one job type must be specified, but "
                             "given {}".format(','.join(config.keys())))
        if 'query' in config:
            if 'query' in config['query']:
                if query is not None:
                    raise ValueError("Query statement can't be specified "
                                     "inside config while it is specified "
                                     "as parameter")
                query = config['query']['query']
                del config['query']['query']
            job_config['query'].update(config['query'])
        else:
            raise ValueError("Only 'query' job type is supported")

    self._start_timer()
    try:
        self._print('Requesting query... ', end="")
        query_reply = self.client.query(
            query,
            job_config=QueryJobConfig.from_api_repr(job_config['query']))
        self._print('ok.')
    except (RefreshError, ValueError):
        if self.private_key:
            raise AccessDenied(
                "The service account credentials are not valid")
        else:
            raise AccessDenied(
                "The credentials have been revoked or expired, "
                "please re-run the application to re-authorize")
    except self.http_error as ex:
        self.process_http_error(ex)

    job_id = query_reply.job_id
    self._print('Job ID: %s\nQuery running...' % job_id)

    while query_reply.state != 'DONE':
        self.print_elapsed_seconds(' Elapsed', 's. Waiting...')
        timeout_ms = job_config['query'].get('timeoutMs')
        if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
            raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms))
        timeout_sec = 1.0
        if timeout_ms:
            # Wait at most 1 second so we can show progress bar
            timeout_sec = min(1.0, timeout_ms / 1000.0)
        try:
            query_reply.result(timeout=timeout_sec)
        except TimeoutError:
            # Use our own timeout logic
            pass
        except self.http_error as ex:
            self.process_http_error(ex)

    if self.verbose:
        if query_reply.cache_hit:
            self._print('Query done.\nCache hit.\n')
        else:
            bytes_processed = query_reply.total_bytes_processed or 0
            bytes_billed = query_reply.total_bytes_billed or 0
            self._print('Query done.\nProcessed: {} Billed: {}'.format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed)))
            self._print('Standard price: ${:,.2f} USD\n'.format(
                bytes_billed * self.query_price_for_TB))
        self._print('Retrieving results...')

    try:
        rows_iter = query_reply.result()
    except self.http_error as ex:
        self.process_http_error(ex)
    result_rows = list(rows_iter)
    total_rows = rows_iter.total_rows
    schema = {
        'fields': [field.to_api_repr() for field in rows_iter.schema],
    }

    # print basic query stats
    self._print('Got {} rows.\n'.format(total_rows))

    return schema, result_rows
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(temp_table_ref,
                                     dest_table_ref,
                                     location='US',
                                     job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)
        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template,
                                                   template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(merge_sql,
                                 location='US',
                                 job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
def dry_run_query(self, query):
    job_config = QueryJobConfig()
    job_config.dry_run = True
    job_config.use_query_cache = False
    return self.client.query(query=query, job_config=job_config)
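A self-contained sketch of the same dry-run pattern. A dry-run job never executes the query; it only reports the bytes it would scan via the standard `QueryJob.total_bytes_processed` attribute. The public dataset and query here are placeholders, not from the original.

from google.cloud import bigquery

# Assumed example: estimate cost without running the query.
client = bigquery.Client()
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10",
    job_config=job_config)
# The job is never executed; only the scan estimate is returned.
print(f"Estimated bytes processed: {job.total_bytes_processed}")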
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " + str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None
        try:
            dest_project, dest_dataset, dest_table = query["destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )
        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " + label_group)
            labels[key] = value
        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError("invalid int in url query maximum_bytes_billed: " +
                             query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " + query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError("url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError("url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(query["use_query_cache"])
        except ValueError:
            raise ValueError("invalid boolean in url query for use_query_cache: " +
                             query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
from datetime import datetime, timedelta, date

from google.cloud import bigquery
from google.cloud.bigquery.table import RowIterator
from google.cloud.bigquery import QueryJobConfig

billing_project = "momovn-dev"
conf = QueryJobConfig()
conf.use_query_cache = True
conf.use_legacy_sql = False

checkpointDate = None
start_date = datetime.strptime('20201002', '%Y%m%d').date()
end_date = datetime.strptime('20201002', '%Y%m%d').date()
day_count = (end_date - start_date).days + 1
for checkpointDate in (start_date + timedelta(n) for n in range(day_count)):
    try:
        # checkpointDate = datetime.strptime(single_date, '%Y%m%d').date()
        checkpointDateWithoutDash = checkpointDate.strftime("%Y%m%d")
        checkpointDateWithDash = checkpointDate.strftime("%Y-%m-%d")
        query = f"""WITH A AS(
            SELECT GPS.reference PHONE
            FROM `momovn-prod.HERMES.HERMES_LOCATIONS` GPS
            WHERE DATE(GPS.event_timestamp, 'Asia/Bangkok') = '{checkpointDateWithDash}')
        SELECT COUNT(DISTINCT T1.USER_ID), 'HERMES LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN A T2 ON T1.USER_ID = T2.PHONE
        WHERE T2.PHONE IS NULL
        UNION ALL
        SELECT COUNT(DISTINCT T1.USER_ID), 'USER_LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN `momovn-prod.HERMES.USER_LOCATIONS_{checkpointDateWithoutDash}` T2
        ON T1.USER_ID = T2.USER_ID
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(temp_table_ref,
                                 dest_table_ref,
                                 location='US',
                                 job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)