import json
import logging
import os
import time

from google.cloud import bigquery
from google.cloud.bigquery import TimePartitioning


def load_task(ds, **kwargs):
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    schema_path = os.path.join(
        dags_folder, 'resources/stages/raw/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    schema = adjust_schema_for_kovan(dag_id, task, schema)
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV if file_format == 'csv' \
        else bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    if load_all_partitions:
        uri = '{export_location_uri}/{task}/*.{file_format}'.format(
            export_location_uri=export_location_uri, task=task, file_format=file_format)
    else:
        uri = '{export_location_uri}/{task}/block_date={ds}/*.{file_format}'.format(
            export_location_uri=export_location_uri, task=task, ds=ds, file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
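# submit_bigquery_job is referenced throughout this section but defined elsewhere. A minimal
# sketch of what such a helper might do, assuming it only waits for the job and surfaces errors
# (the logging and error handling here are assumptions, not the project's actual implementation):
def submit_bigquery_job(job, configuration):
    try:
        logging.info('Submitting job config: ' + json.dumps(configuration.to_api_repr()))
        result = job.result()  # blocks until the job completes; raises on failure
        logging.info(result)
        assert job.errors is None or len(job.errors) == 0
        return result
    except Exception:
        logging.info(job.errors)
        raise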
def seed_task():
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig()
    schema_path = os.path.join(
        dags_folder, 'resources/stages/seed/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.ignore_unknown_values = True

    file_path = os.path.join(
        dags_folder, 'resources/stages/seed/data/{task}.csv'.format(task=task))
    table_ref = client.dataset(
        project='blockchain-etl-internal', dataset_id='common').table(task)
    load_job = client.load_table_from_file(
        open(file_path, mode='r+b'), table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
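# read_bigquery_schema_from_file is also defined outside this section. A plausible sketch,
# assuming the schema files are JSON lists of {name, type, mode, description} objects
# (the field layout is an assumption; adjust to the repository's actual schema format):
def read_bigquery_schema_from_file(filepath):
    with open(filepath) as schema_file:
        schema_json = json.load(schema_file)
    return [
        bigquery.SchemaField(
            name=field['name'],
            field_type=field.get('type', 'STRING'),
            mode=field.get('mode', 'NULLABLE'),
            description=field.get('description', ''))
        for field in schema_json
    ]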
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = bigquery.Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and
    # descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

    schema_path = os.path.join(
        dags_folder, 'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table = bigquery.Table(temp_table_ref, schema=schema)

    description_path = os.path.join(
        dags_folder, 'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions or always_load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name, project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder, 'resources/stages/enrich/sqls/merge/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = destination_dataset_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)

        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
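# The three callables above (load_task, seed_task, enrich_task) are closures: they read task,
# dags_folder, output_bucket, dataset names, and similar variables from the enclosing
# DAG-building scope. A minimal sketch of how one of them might be attached to a DAG with a
# classic Airflow 1.x PythonOperator; the operator arguments and the surrounding dag/task
# variables are assumptions for illustration, not the project's exact wiring:
from datetime import timedelta
from airflow.operators.python_operator import PythonOperator

load_operator = PythonOperator(
    task_id='load_{task}'.format(task=task),
    python_callable=load_task,
    provide_context=True,  # passes ds and the rest of the template context into **kwargs
    execution_timeout=timedelta(minutes=60),
    dag=dag,
)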
def create_or_update_history_table(
        bigquery_client,
        dataset_name,
        history_table_name,
        table_definition,
        ds,
        public_project_id,
        public_dataset_name,
        internal_project_id,
        destination_project_id,
        sqls_folder,
        parse_all_partitions,
        time_func=time.time):
    table_name = table_definition['table']['table_name']
    schema = table_definition['table']['schema']
    parser_type = table_definition['parser'].get('type', 'log')

    schema = read_bigquery_schema_from_dict(schema, parser_type)

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time_func() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(temp_table_name)

    temp_table = bigquery.Table(temp_table_ref, schema=schema)

    table_description = table_definition['table']['table_description']
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    udf_name = 'parse_{}'.format(table_name)

    selector = abi_to_selector(parser_type, table_definition['parser']['abi'])

    parse_mode = get_parse_mode(HistoryType.HISTORY, parse_all_partitions=parse_all_partitions)
    full_source_table_name = get_source_table(
        parser_type=parser_type,
        parse_mode=parse_mode,
        ds=ds,
        internal_project_id=internal_project_id,
        public_project_id=public_project_id,
        public_dataset_name=public_dataset_name,
        selector=selector)

    sql = generate_parse_sql_template(
        sqls_folder,
        parser_type,
        parse_mode,
        full_source_table_name=full_source_table_name,
        selector=selector,
        internal_project_id=internal_project_id,
        destination_project_id=destination_project_id,
        dataset_name=dataset_name,
        udf_name=udf_name,
        table_definition=table_definition,
        parse_all_partitions=parse_all_partitions,
        ds=ds)
    query(bigquery_client, sql, destination=temp_table_ref)

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name, internal_project_id)
        dest_table_ref = dataset.table(history_table_name)
        copy_job = bigquery_client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to update the description because the copy above won't respect it
        # if the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        merge_sql = render_merge_template(
            sqls_folder,
            table_schema=schema,
            internal_project_id=internal_project_id,
            dataset_name=dataset_name,
            destination_table_name=history_table_name,
            dataset_name_temp=dataset_name_temp,
            source_table=temp_table_name,
            ds=ds)
        query(bigquery_client, merge_sql)

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
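# The query helper used in create_or_update_history_table is defined elsewhere. A minimal
# sketch, assuming it wraps an interactive BigQuery query with an optional destination table
# (the exact signature and location are assumptions):
def query(bigquery_client, sql, destination=None):
    job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    if destination is not None:
        job_config.destination = destination
    query_job = bigquery_client.query(sql, location='US', job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'
    return query_job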
def create_or_update_table_from_table_definition(
        bigquery_client,
        table_definition,
        ds,
        source_project_id,
        source_dataset_name,
        destination_project_id,
        sqls_folder,
        parse_all_partitions,
        airflow_task):
    dataset_name = 'ethereum_' + table_definition['table']['dataset_name']
    table_name = table_definition['table']['table_name']
    table_description = table_definition['table']['table_description']
    schema = table_definition['table']['schema']
    parser = table_definition['parser']
    parser_type = parser.get('type', 'log')
    abi = json.dumps(parser['abi'])
    columns = [c.get('name') for c in schema]

    template_context = {}
    template_context['ds'] = ds
    template_context['params'] = {}
    template_context['params']['source_project_id'] = source_project_id
    template_context['params']['source_dataset_name'] = source_dataset_name
    template_context['params']['table_name'] = table_name
    template_context['params']['columns'] = columns
    template_context['params']['parser'] = parser
    template_context['params']['abi'] = abi
    if parser_type == 'log':
        template_context['params']['event_topic'] = abi_to_event_topic(parser['abi'])
    elif parser_type == 'trace':
        template_context['params']['method_selector'] = abi_to_method_selector(parser['abi'])
    template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
    template_context['params']['parse_all_partitions'] = parse_all_partitions

    contract_address = parser['contract_address']
    if not contract_address.startswith('0x'):
        contract_address_sql = replace_refs(
            contract_address, ref_regex, destination_project_id, dataset_name)
        template_context['params']['parser']['contract_address_sql'] = contract_address_sql

    # # # Create a temporary table

    dataset_name_temp = 'parse_temp'
    create_dataset(bigquery_client, dataset_name_temp)
    temp_table_name = 'temp_{table_name}_{milliseconds}' \
        .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = bigquery_client.dataset(dataset_name_temp).table(temp_table_name)

    temp_table = bigquery.Table(
        temp_table_ref, schema=read_bigquery_schema_from_dict(schema, parser_type))
    temp_table.description = table_description
    temp_table.time_partitioning = bigquery.TimePartitioning(field='block_timestamp')
    logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
    temp_table = bigquery_client.create_table(temp_table)
    assert temp_table.table_id == temp_table_name

    # # # Query to temporary table

    job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    job_config.priority = bigquery.QueryPriority.INTERACTIVE
    job_config.destination = temp_table_ref
    sql_template = get_parse_sql_template(parser_type, sqls_folder)
    sql = airflow_task.render_template('', sql_template, template_context)
    logging.info(sql)
    query_job = bigquery_client.query(sql, location='US', job_config=job_config)
    submit_bigquery_job(query_job, job_config)
    assert query_job.state == 'DONE'

    # # # Copy / merge to destination

    if parse_all_partitions:
        # Copy temporary table to destination
        copy_job_config = bigquery.CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dataset = create_dataset(bigquery_client, dataset_name, destination_project_id)
        dest_table_ref = dataset.table(table_name)
        copy_job = bigquery_client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
        # Need to update the description because the copy above won't respect it
        # if the destination table already exists
        table = bigquery_client.get_table(dest_table_ref)
        table.description = table_description
        table = bigquery_client.update_table(table, ["description"])
        assert table.description == table_description
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = bigquery.QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

        merge_sql_template = get_merge_table_sql_template(sqls_folder)
        merge_template_context = template_context.copy()
        merge_template_context['params']['source_table'] = temp_table_name
        merge_template_context['params']['destination_dataset_project_id'] = destination_project_id
        merge_template_context['params']['destination_dataset_name'] = dataset_name
        merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
        merge_template_context['params']['columns'] = columns
        merge_sql = airflow_task.render_template('', merge_sql_template, merge_template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = bigquery_client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    bigquery_client.delete_table(temp_table_ref)
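# A sketch of how create_or_update_table_from_table_definition might be driven from a folder of
# table definition JSON files. The folder layout, project IDs, and dataset names below are
# assumptions for illustration only, not the project's actual configuration:
from glob import glob

def parse_all_table_definitions(bigquery_client, table_definitions_folder, ds, sqls_folder, airflow_task):
    for definition_file in glob(os.path.join(table_definitions_folder, '*.json')):
        with open(definition_file) as f:
            table_definition = json.load(f)
        create_or_update_table_from_table_definition(
            bigquery_client=bigquery_client,
            table_definition=table_definition,
            ds=ds,
            source_project_id='bigquery-public-data',   # assumed source project
            source_dataset_name='crypto_ethereum',      # assumed source dataset
            destination_project_id='blockchain-etl',    # assumed destination project
            sqls_folder=sqls_folder,
            parse_all_partitions=False,
            airflow_task=airflow_task)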