def gather_file_properties(self):
    """Gathers properties of the files loaded into the benchmark table."""
    # Gather file properties from the files' path.
    # pylint: disable=line-too-long
    benchmark_details_pattern = \
        r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)'
    self.file_type, compression, self.num_columns, self.column_types, \
        num_files, table_size = \
        re.findall(benchmark_details_pattern, self.path)[0]
    self.compression_format = (
        file_constants.FILE_CONSTANTS['compressionFormats'][compression])

    # Get the schema from the staging table that the file was generated from.
    source_staging_table_name = '{0:s}_{1:s}'.format(
        self.column_types, self.num_columns)
    source_staging_table_util = table_util.TableUtil(
        source_staging_table_name,
        self.staging_dataset_id,
        project=self.staging_project,
    )
    if self.file_type in ('parquet', 'avro'):
        self.bq_schema = None
    else:
        self.bq_schema = source_staging_table_util.table.schema
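# --- Hedged usage sketch (not part of the original module) ---
# Illustrates how the benchmark_details_pattern above decomposes a file path.
# The example path is hypothetical but follows the naming scheme used
# throughout this project.
import re

_EXAMPLE_PATH = ('gs://example-bucket/fileType=csv/compression=none/'
                 'numColumns=10/columnTypes=100_STRING/numFiles=1/'
                 'tableSize=10MB/file1.csv')
_PATTERN = (r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/'
            r'columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)')
file_type, compression, num_columns, column_types, num_files, table_size = \
    re.findall(_PATTERN, _EXAMPLE_PATH)[0]
# Yields ('csv', 'none', '10', '100_STRING', '1', '10MB'); every captured
# group is a string, which is why the code above formats them with {:s}
# instead of converting them to int.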
def _set_job_properties(self):
    """Sets query specific properties."""
    query_properties = dict()
    query_properties['query'] = self.bql
    query_properties['queryCategory'] = self.query_category

    # Get properties from the query job.
    query_properties['totalBytesBilled'] = self.job.total_bytes_billed
    query_properties[
        'totalBytesProcessed'] = self.job.total_bytes_processed

    # Get main table properties.
    main_table_properties = dict()
    main_table_properties['tableType'] = self.table_type
    main_table_properties['tableName'] = self.main_table_name
    main_table_util = table_util.TableUtil(self.main_table_name,
                                           self.table_dataset_id)
    main_table_util.set_table_properties()
    main_table_properties['equivalentBqTableSize'] = \
        main_table_util.size_in_mb
    main_table_properties['fileURI'] = self.file_uri

    # Get properties from the file path.
    properties_from_file_path = self._get_properties_from_file_path(
        self.file_uri)
    main_table_properties.update(properties_from_file_path)
    # Drop stagingDataSize, which is only reported for load benchmarks.
    del main_table_properties['stagingDataSize']

    query_properties['mainTable'] = main_table_properties
    self.results_dict['queryProperties'] = query_properties
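# --- Hedged sketch (not part of the original module) ---
# The totalBytesBilled/totalBytesProcessed values read above come from
# google-cloud-bigquery's QueryJob statistics; a minimal standalone
# illustration (hypothetical client and query) looks like this.
from google.cloud import bigquery

client = bigquery.Client()
job = client.query('SELECT 1')
job.result()  # wait for the job to finish so its statistics are populated
print(job.total_bytes_billed, job.total_bytes_processed)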
def __init__(self, table_id, dataset_id):
    self.table_id = table_id
    self.dataset_id = dataset_id
    self.bq_table_util = table_util.TableUtil(self.table_id,
                                              self.dataset_id)
    self.bq_table_util.set_table_properties()
    self.schema = self.bq_table_util.table.schema
    self.query_strings = {}
def run_federated_query(self, query_type, query):
    """Runs a federated query on external files.

    Args:
        query_type(str): Code for the category of the query to run
            (SIMPLE_SELECT_*, SELECT_ONE_STRING, SELECT_50_PERCENT).
        query(str): The query to run.
    """
    file_formats = file_constants.FILE_CONSTANTS['sourceFormats']
    source_format = file_formats[self.file_type]
    external_config = bigquery.ExternalConfig(source_format=source_format)
    external_config.source_uris = [self.file_uri + '/*']
    # Avro and parquet files are self-describing, so a schema is only
    # needed for the other formats.
    if source_format not in ('AVRO', 'PARQUET'):
        main_table_util = table_util.TableUtil(self.native_table_id,
                                               self.dataset_id)
        external_config.schema = main_table_util.table.schema
    if source_format == 'CSV':
        external_config.options.skip_leading_rows = 1
    external_config.compression = self.compression.upper()

    table_id = self.native_table_id + '_external'
    results_destination = '{0:s}.{1:s}.{2:s}_query_results'.format(
        self.bq_project, self.dataset_id, table_id)
    logging.info(
        'Storing query results in {0:s}'.format(results_destination))
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config},
        use_legacy_sql=False,
        allow_large_results=True,
        destination=results_destination)
    bql = query.format(table_id)
    logging.info(bql)
    query_job = self.bq_client.query(bql, job_config=job_config)
    logging.info('Running external {0:s} query.'.format(query_type))
    query_job.result()

    query_result = benchmark_result_util.QueryBenchmarkResultUtil(
        job=query_job,
        job_type=self.job_type,
        benchmark_name=self.benchmark_name,
        project_id=self.bq_project,
        results_table_name=self.results_table_name,
        results_dataset_id=self.results_table_dataset_id,
        bq_logs_dataset=self.bq_logs_dataset_id,
        bql=bql,
        query_category=query_type,
        main_table_name=self.native_table_id,
        table_dataset_id=self.dataset_id,
        table_type=EXTERNAL_TYPE_ID,
        file_uri=self.file_uri,
    )
    query_result.insert_results_row()

    self.bq_client.delete_table(results_destination)
    logging.info('Deleting results destination table {0:s}'.format(
        results_destination))
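# --- Hedged sketch (not part of the original module) ---
# Minimal standalone version of the federated-query pattern used above:
# bind an ExternalConfig to a temporary table name via table_definitions
# and query that name directly. Bucket, path, and table name here are
# hypothetical.
from google.cloud import bigquery

client = bigquery.Client()
external_config = bigquery.ExternalConfig('CSV')
external_config.source_uris = ['gs://example-bucket/data/*']
external_config.options.skip_leading_rows = 1
external_config.autodetect = True

job_config = bigquery.QueryJobConfig(
    table_definitions={'example_external': external_config},
    use_legacy_sql=False,
)
rows = client.query('SELECT COUNT(*) FROM example_external',
                    job_config=job_config).result()
print(list(rows))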
def create_table(self):
    """Creates the benchmark table in BigQuery.

    The method creates an empty table using the schema from the staging
    table that the files were generated from. It uses the current
    timestamp to name the benchmark table so that each table gets a
    unique name.
    """
    self.job_destination_table = '{0:d}'.format(int(time.time()))
    self.benchmark_table_util = table_util.TableUtil(
        self.job_destination_table,
        self.dataset_id,
        bq_schema=self.bq_schema,
    )
    self.benchmark_table_util.create_table()
def _set_job_properties(self):
    """Sets load specific properties."""
    load_properties = {}

    # Get properties from the benchmark table.
    benchmark_table_util = table_util.TableUtil(self.load_table_id,
                                                self.load_dataset_id)
    benchmark_table_util.set_table_properties()
    load_properties['numRows'] = benchmark_table_util.table.num_rows

    # Get properties from the load job.
    load_properties['numFiles'] = self.job.input_files
    load_properties['sourceFormat'] = self.job.source_format

    # Get properties from the file path.
    # pylint: disable=line-too-long
    benchmark_details_pattern = \
        r'gs://([\w\'-]+)/fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\d+)(\w+)'
    bucket_name, file_type, compression, num_columns, column_types, \
        expected_num_files, staging_data_size, staging_data_unit = \
        re.findall(benchmark_details_pattern, self.job_source_uri)[0]
    compression_format = (
        file_constants.FILE_CONSTANTS['compressionFormats'][compression])
    file_name_prefix = ('fileType={0:s}/compression={1:s}/numColumns={2:s}/'
                        'columnTypes={3:s}/numFiles={4:s}/'
                        'tableSize={5:s}{6:s}').format(
                            file_type, compression, num_columns,
                            column_types, expected_num_files,
                            staging_data_size, staging_data_unit)
    bucket = self.storage_client.get_bucket(bucket_name)
    files_consts = file_constants.FILE_CONSTANTS
    if compression == 'none':
        file_ext = file_type
    else:
        file_ext = files_consts['compressionExtensions'][compression]
    file_name = '{0:s}/file1.{1:s}'.format(file_name_prefix, file_ext)
    file_size = float(bucket.get_blob(file_name).size) / BYTES_IN_MB

    load_properties['fileType'] = file_type
    load_properties['compressionType'] = compression_format
    load_properties['numColumns'] = num_columns
    load_properties['columnTypes'] = column_types
    load_properties['fileSize'] = file_size
    load_properties['stagingDataSize'] = staging_data_size
    load_properties['destinationTable'] = '{0:s}.{1:s}.{2:s}'.format(
        self.project_id, self.load_dataset_id, self.load_table_id)
    load_properties['sourceURI'] = self.job_source_uri

    self.results_dict['loadProperties'] = load_properties
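# --- Hedged sketch (not part of the original module) ---
# The fileSize calculation above relies on google-cloud-storage blob metadata.
# BYTES_IN_MB is assumed here to be 10**6, matching the MB convention used by
# the file path names; the bucket and object names are hypothetical.
from google.cloud import storage

BYTES_IN_MB = 1000000  # assumption for this sketch
client = storage.Client()
bucket = client.get_bucket('example-benchmark-bucket')
blob = bucket.get_blob('fileType=csv/compression=none/numColumns=10/'
                       'columnTypes=100_STRING/numFiles=1/tableSize=10MB/'
                       'file1.csv')
if blob is not None:
    print(float(blob.size) / BYTES_IN_MB)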
def _set_job_properties(self):
    """Sets load specific properties."""
    load_properties = dict()
    load_properties['destinationTable'] = '{0:s}.{1:s}.{2:s}'.format(
        self.project_id, self.load_dataset_id, self.load_table_id)
    load_properties['sourceURI'] = self.job_source_uri

    # Get properties from the benchmark table.
    benchmark_table_util = table_util.TableUtil(self.load_table_id,
                                                self.load_dataset_id)
    benchmark_table_util.set_table_properties()
    load_properties['numRows'] = benchmark_table_util.table.num_rows

    # Get properties from the load job.
    load_properties['sourceFormat'] = self.job.source_format

    # Get properties from the file path.
    properties_from_file_path = self._get_properties_from_file_path(
        self.job_source_uri)
    load_properties.update(properties_from_file_path)

    self.results_dict['loadProperties'] = load_properties
def create_files(self):
    """Creates all file combinations and stores them in GCS.

    Generates the list of file combinations from the parameters in the
    file_parameters.FILE_PARAMETERS dictionary, and creates each file in
    the list, as long as it doesn't yet exist in self.bucket. While each
    file is generated from a BigQuery staging table and stored in GCS,
    the method of creating the file varies depending on the parameters
    in the combination.
    """
    # Gather file parameters and constants.
    files_consts = file_constants.FILE_CONSTANTS
    file_types = self.file_params['fileType']
    extract_formats = files_consts['extractFormats']
    file_compression_types = self.file_params['fileCompressionTypes']
    file_counts = self.file_params['numFiles']

    # Begin the process of iterating through each combination.
    logging.info('Starting to create files by exporting staging tables to '
                 'bucket {0:s}'.format(self.bucket_name))
    skip_message = 'Skipped path and its subsequent files: {0:s}'

    # Gather a list of the staging tables. The staging tables already
    # include the columnTypes, numColumns, and stagingDataSizes
    # parameters (ex: the staging table 100_STRING_10_10MB has
    # columnType=100_STRING, numColumns=10, and stagingDataSizes=10MB).
    tables = self.primitive_staging_tables
    if len(tables) == 0:
        logging.info('Dataset {0:s} contains no tables. Please create '
                     'staging tables in {0:s}.'.format(
                         self.primitive_staging_dataset_id))

    # For each staging table, extract to each fileType, each
    # compressionType, and copy each file so that the combination has
    # the correct numFiles.
    for (table_list_item, file_type, num_files) in \
            itertools.product(tables, file_types, file_counts):
        for compression_type in file_compression_types[file_type]:
            staging_table_util = table_util.TableUtil(
                table_list_item.table_id,
                table_list_item.dataset_id,
            )
            staging_table_util.set_table_properties()
            gcs_prefix = 'gs://{0:s}/'.format(self.bucket_name)
            dest_string = ('fileType={0:s}/'
                           'compression={1:s}/'
                           'numColumns={2:d}/'
                           'columnTypes={3:s}/'
                           'numFiles={4:d}/'
                           'tableSize={5:d}MB/')
            destination_path = dest_string.format(
                file_type,
                compression_type,
                staging_table_util.num_columns,
                staging_table_util.column_types,
                num_files,
                int(staging_table_util.table_size / 1000000),
            )
            if compression_type == 'none':
                extension = file_type
            else:
                extensions = files_consts['compressionExtensions']
                extension = extensions[compression_type]
            file_string = 'file1'
            destination_prefix = '{0:s}{1:s}{2:s}'.format(
                gcs_prefix,
                destination_path,
                file_string,
            )
            if num_files == 1:
                # If the number of files in the current combination is 1,
                # check to see if the one file doesn't yet exist.
                blob_name = '{0:s}{1:s}.{2:s}'.format(
                    destination_path,
                    file_string,
                    extension,
                )
                if not storage.Blob(
                        bucket=self.bucket,
                        name=blob_name,
                ).exists(self.gcs_client):
                    # If the one file doesn't yet exist, it needs to be
                    # created. The method of creation depends on the file
                    # type.
                    if file_type == 'parquet':
                        # If the file type is parquet, use the
                        # _create_parquet_file() method, which uses
                        # DataFlow for file creation.
                        self._create_parquet_file(
                            blob_name,
                            staging_table_util,
                            destination_prefix,
                        )
                    else:
                        # Otherwise, use the _extract_tables_to_files()
                        # method, which uses BigQuery extract jobs.
                        destination_format = extract_formats[file_type]
                        self._extract_tables_to_files(
                            blob_name,
                            compression_type,
                            destination_format,
                            destination_prefix,
                            extension,
                            staging_table_util,
                        )
                else:
                    # If the one file already exists, skip its creation.
                    logging.info(skip_message.format(blob_name))
            else:
                # If the numFiles parameter in the current iteration is not
                # 1, multiple files need to be created for the combination.
                # In this case, obtain the file from the combination in
                # which all parameters are identical to the current
                # combination, except that numFiles=1. That file will be
                # used to make copies for the combinations where
                # numFiles > 1, since copying files is faster than running
                # a new extraction or DataFlow job. For example, if the
                # current combination is
                # fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=100/tableSize=10MB/  # pylint: disable=line-too-long
                # then
                # fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=1/tableSize=10MB/file1.csv  # pylint: disable=line-too-long
                # will be used to make copies for the current combination.
                file1_destination_path = dest_string.format(
                    file_type,
                    compression_type,
                    staging_table_util.num_columns,
                    staging_table_util.column_types,
                    1,
                    int(staging_table_util.table_size / 1000000),
                )
                file1_blob_name = '{0:s}{1:s}.{2:s}'.format(
                    file1_destination_path,
                    file_string,
                    extension,
                )
                # Before making copies for the current combination, check
                # that the first file in the combination doesn't yet exist.
                # If it doesn't, then proceed. If it does exist, assume
                # that all other files in the combination already exist
                # too. While this can be a risky assumption, it saves a lot
                # of time, since combinations can contain as many as 10000
                # files. If a combination is stopped in the middle of
                # generating a large number of files, the
                # restart_incomplete_combination() method can be used to
                # ensure the combination gets completed without taking the
                # time to check each file's existence here.
                first_of_n_blobs = '{0:s}{1:s}.{2:s}'.format(
                    destination_path,
                    file_string,
                    extension,
                )
                if not storage.Blob(
                        bucket=self.bucket,
                        name=first_of_n_blobs,
                ).exists(self.gcs_client):
                    # If the first file in the combination doesn't exist,
                    # run copy_blobs() to create each file in the
                    # combination.
                    start_num = 1
                    self.copy_blobs(
                        file1_blob_name,
                        destination_path,
                        extension,
                        start_num,
                        num_files,
                    )
                else:
                    # Otherwise, skip creating the first file and all
                    # subsequent files in the combination.
                    logging.info(skip_message.format(first_of_n_blobs))
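# --- Hedged sketch (not part of the original module) ---
# Toy illustration of the combination loop shape used in create_files():
# iterate the cross product of staging tables, file types, and file counts,
# then the compression types valid for each file type. Parameter values here
# are made up; the real ones live in file_parameters.FILE_PARAMETERS.
import itertools

tables = ['100_STRING_10_10MB']          # staging table naming scheme above
file_types = ['csv', 'avro']
file_counts = [1, 100]
file_compression_types = {'csv': ['none', 'gzip'], 'avro': ['none']}

for table, file_type, num_files in itertools.product(
        tables, file_types, file_counts):
    for compression_type in file_compression_types[file_type]:
        print(table, file_type, compression_type, num_files)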
def main(argv=None):
    args = parse_args(argv)
    create_results_table = args.create_results_table
    create_benchmark_schemas = args.create_benchmark_schemas
    benchmark_table_schemas_dir = args.benchmark_table_schemas_directory
    create_staging_tables = args.create_staging_tables
    create_files = args.create_files
    restart_file = args.restart_file
    create_benchmark_tables = args.create_benchmark_tables
    duplicate_benchmark_tables = args.duplicate_benchmark_tables
    bq_project_id = args.bq_project_id
    benchmark_dataset_id = args.benchmark_dataset_id
    staging_project_id = args.staging_project_id
    staging_dataset_id = args.staging_dataset_id
    resized_staging_dataset_id = args.resized_staging_dataset_id
    results_table_name = args.results_table_name
    results_dataset_id = args.results_dataset_id
    results_table_schema_path = args.results_table_schema_path
    gcs_project_id = args.gcs_project_id
    bucket_name = args.bucket_name
    dataflow_temp_location = args.dataflow_temp_location
    dataflow_staging_location = args.dataflow_staging_location
    bq_logs_dataset = args.bq_logs_dataset
    file_params = load_file_parameters.FILE_PARAMETERS

    # Run provided commands.
    if create_results_table:
        logging.info('Creating results table {0:s} from schema in '
                     '{1:s}.'.format(
                         results_table_name,
                         results_table_schema_path,
                     ))
        results_table_util = table_util.TableUtil(
            table_id=results_table_name,
            dataset_id=results_dataset_id,
            json_schema_filename=results_table_schema_path,
        )
        results_table_util.create_table()
        logging.info('Done creating results table.')

    if create_benchmark_schemas:
        benchmark_schema_creator = schema_creator.SchemaCreator(
            schemas_dir=benchmark_table_schemas_dir,
            file_params=file_params)
        benchmark_schema_creator.create_schemas()

    if create_staging_tables:
        benchmark_staging_table_generator = (
            staging_table_generator.StagingTableGenerator(
                project=bq_project_id,
                staging_dataset_id=staging_dataset_id,
                resized_dataset_id=resized_staging_dataset_id,
                json_schema_path=benchmark_table_schemas_dir,
                file_params=file_params,
                num_rows=500))
        benchmark_staging_table_generator.create_staging_tables(
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        benchmark_staging_table_generator.create_resized_tables()

    if create_files:
        benchmark_load_file_generator = load_file_generator.FileGenerator(
            project_id=gcs_project_id,
            primitive_staging_dataset_id=resized_staging_dataset_id,
            bucket_name=bucket_name,
            file_params=file_params,
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        if restart_file:
            benchmark_load_file_generator.restart_incomplete_combination(
                restart_file)
        benchmark_load_file_generator.create_files()

    if create_benchmark_tables:
        benchmark_tables_processor = load_tables_processor.LoadTablesProcessor(
            benchmark_name=BENCHMARK_NAME,
            bq_project=bq_project_id,
            gcs_project=gcs_project_id,
            staging_project=staging_project_id,
            staging_dataset_id=staging_dataset_id,
            dataset_id=benchmark_dataset_id,
            bucket_name=bucket_name,
            results_table_name=results_table_name,
            results_table_dataset_id=results_dataset_id,
            duplicate_benchmark_tables=duplicate_benchmark_tables,
            file_params=file_params,
            bq_logs_dataset=bq_logs_dataset,
        )
        benchmark_tables_processor.create_benchmark_tables()