def Cleanup(benchmark_spec):
  """Deletes the bucket created for each resource lifecycle step.

  Args:
    benchmark_spec: Configuration that holds the definition and instance
        details of the resources used for benchmarking.
  """
  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  base_folder = benchmark_spec.uuid.split('-')[0]
  for lifecycle_step, _ in RESOURCE_LIFECYCLE_ARTIFACTS.items():
    dml_script_folder = '{}_{}'.format(base_folder, lifecycle_step)
    storage_service.DeleteBucket(dml_script_folder)
def Prepare(benchmark_spec):
  """Prepares VMs with the cloud provider tool and the benchmark data file.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  providers.LoadProvider(FLAGS.storage)

  service = object_storage_service.GetObjectStorageClass(FLAGS.storage)()
  service.PrepareService(FLAGS.object_storage_region)

  vms = benchmark_spec.vms
  vm_util.RunThreaded(lambda vm: PrepareVM(vm, service), vms)

  # Always clean up server-side state, even when an exception happens.
  benchmark_spec.always_call_cleanup = True

  # Make the bucket(s).
  bucket_name = 'pkb%s' % FLAGS.run_uri
  if FLAGS.storage != 'GCP' or not FLAGS.object_storage_gcs_multiregion:
    service.MakeBucket(bucket_name)
  else:
    # Use a GCS multiregional bucket.
    multiregional_service = gcs.GoogleCloudStorageService()
    multiregional_service.PrepareService(
        FLAGS.object_storage_gcs_multiregion or DEFAULT_GCS_MULTIREGION)
    multiregional_service.MakeBucket(bucket_name)

  # Save the service and the buckets for later.
  benchmark_spec.service = service
  benchmark_spec.buckets = [bucket_name]
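# A minimal sketch of the Cleanup counterpart to the Prepare above, assuming
# the service and bucket list that Prepare saves on benchmark_spec. Only
# DeleteBucket (used elsewhere in this code) is relied on; ExampleCleanup is
# an illustrative name, not the benchmark's actual Cleanup phase.
def ExampleCleanup(benchmark_spec):
  """Deletes the buckets created by Prepare (illustrative sketch only)."""
  service = benchmark_spec.service
  for bucket in benchmark_spec.buckets:
    service.DeleteBucket(bucket)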
def _GetService() -> object_storage_service.ObjectStorageService:
  """Get a ready to use instance of ObjectStorageService."""
  # TODO(pclay): consider using FLAGS.storage to allow cross cloud testing?
  cloud = FLAGS.cloud
  providers.LoadProvider(cloud)
  service = object_storage_service.GetObjectStorageClass(cloud)()
  # This method is idempotent with default args and safe to call in each phase.
  service.PrepareService(FLAGS.object_storage_region)
  return service
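# A minimal sketch of how _GetService is meant to be consumed: because
# PrepareService is idempotent, each phase can simply request a fresh handle
# instead of passing a service object between phases. The bucket name follows
# the 'pkb%s' % FLAGS.run_uri convention used elsewhere in this code, and the
# Example* function names are illustrative, not actual benchmark phases.
def ExampleMakeBucketPhase():
  """Creates the run's bucket using a fresh service handle (sketch)."""
  service = _GetService()
  service.MakeBucket('pkb%s' % FLAGS.run_uri)


def ExampleDeleteBucketPhase():
  """Deletes the run's bucket; calling _GetService again is safe (sketch)."""
  service = _GetService()
  service.DeleteBucket('pkb%s' % FLAGS.run_uri)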
def Prepare(benchmark_spec):
  """Installs and sets up the dataset on the Spark clusters.

  Copies scripts and all the queries to cloud.
  Creates external Hive tables for data (unless BigQuery is being used).

  Args:
    benchmark_spec: The benchmark specification.
  """
  dpb_service_instance = benchmark_spec.dpb_service
  run_uri = benchmark_spec.uuid.split('-')[0]
  dpb_service_instance.CreateBucket(run_uri)

  temp_run_dir = temp_dir.GetRunDirPath()
  spark_sql_perf_dir = os.path.join(temp_run_dir, 'spark_sql_perf_dir')
  vm_util.IssueCommand(
      ['git', 'clone', SPARK_SQL_PERF_GIT, spark_sql_perf_dir])
  vm_util.IssueCommand(['git', 'checkout', SPARK_SQL_PERF_GIT_COMMIT],
                       cwd=spark_sql_perf_dir)
  query_dir = os.path.join(spark_sql_perf_dir, 'src', 'main', 'resources',
                           FLAGS.dpb_sparksql_query)
  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  dst_url = '{prefix}{uri}'.format(
      prefix=dpb_service_instance.PERSISTENT_FS_PREFIX, uri=run_uri)
  for dir_name, _, files in os.walk(query_dir):
    for filename in files:
      match = re.match(r'q?([0-9]+)a?.sql', filename)
      if match:
        query_id = match.group(1)
        # If an order is specified, only upload those queries.
        if not FLAGS.dpb_sparksql_order or query_id in FLAGS.dpb_sparksql_order:
          query = '{}.sql'.format(query_id)
          src_url = os.path.join(dir_name, filename)
          storage_service.Copy(src_url, os.path.join(dst_url, query))
  for script in [SPARK_TABLE_SCRIPT, SPARK_SQL_RUNNER_SCRIPT]:
    src_url = data.ResourcePath(script)
    storage_service.Copy(src_url, dst_url)
  benchmark_spec.base_dir = dst_url

  # Create external Hive tables if not reading the data from BigQuery.
  if FLAGS.dpb_sparksql_data:
    stdout = storage_service.List(FLAGS.dpb_sparksql_data)
    for table_dir in stdout.split('\n'):
      # The directory name is the table name.
      if not table_dir:
        continue
      table = re.split(' |/', table_dir.rstrip('/')).pop()
      stats = dpb_service_instance.SubmitJob(
          pyspark_file=os.path.join(dst_url, SPARK_TABLE_SCRIPT),
          job_type=BaseDpbService.PYSPARK_JOB_TYPE,
          job_arguments=[FLAGS.dpb_sparksql_data, table])
      logging.info(stats)
      if not stats['success']:
        logging.warning('Creating table %s from %s failed', table, table_dir)
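# A small standalone illustration of the file name pattern the query upload
# loop above keys off: it accepts names like 'q1.sql', '14.sql', or
# 'q14a.sql' and captures the numeric query id. The sample names below are
# illustrative, not the full spark-sql-perf resource listing; `re` is
# repeated here only so the sketch is self-contained.
import re


def _example_query_id_matching():
  """Shows which file names the upload loop would accept (illustrative)."""
  for name in ('q1.sql', '14.sql', 'q14a.sql', 'README.md'):
    match = re.match(r'q?([0-9]+)a?.sql', name)
    # Expected captured ids: '1', '14', '14', and None for README.md.
    print(name, '->', match.group(1) if match else None)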
def Prepare(benchmark_spec):
  """Prepares VMs with the cloud provider tool and the benchmark data file.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
  """
  providers.LoadProvider(FLAGS.storage)

  service = object_storage_service.GetObjectStorageClass(FLAGS.storage)()
  service.PrepareService(FLAGS.object_storage_region)

  vms = benchmark_spec.vms
  for vm in vms:
    PrepareVM(vm, service)
    service.PrepareVM(vm)

  # Always clean up server-side state, even when an exception happens.
  benchmark_spec.always_call_cleanup = True

  # Make the bucket(s).
  bucket_name = 'pkb%s' % FLAGS.run_uri
  if FLAGS.storage != 'GCP':
    service.MakeBucket(bucket_name)
    buckets = [bucket_name]
  else:
    # TODO(nlavine): make GCP bucket name handling match other providers.
    # Leaving it inconsistent for now to match previous behavior, but should
    # change it after a reasonable deprecation period.
    multiregional_service = gcs.GoogleCloudStorageService()
    multiregional_service.PrepareService(
        FLAGS.object_storage_gcs_multiregion or DEFAULT_GCS_MULTIREGION)
    multiregional_service.MakeBucket(bucket_name)

    region = FLAGS.object_storage_region or gcs.DEFAULT_GCP_REGION
    regional_bucket_name = 'pkb%s-%s' % (FLAGS.run_uri, region)
    regional_service = gcs.GoogleCloudStorageService()
    regional_service.PrepareService(region)
    regional_service.MakeBucket(regional_bucket_name)
    buckets = [bucket_name, regional_bucket_name]

  # Save the service and the buckets for later.
  benchmark_spec.service = service
  benchmark_spec.buckets = buckets
def Prepare(benchmark_spec):
  """Prepares the table using the schema creation script and sample data.

  Args:
    benchmark_spec: Configuration that holds the definition and instance
        details of the resources used for benchmarking.
  """
  storage_service = object_storage_service.GetObjectStorageClass(FLAGS.cloud)()
  dpb_service_instance = benchmark_spec.dpb_service
  run_uri = benchmark_spec.uuid.split('-')[0]

  uri_map = ManageLifecycleResources(run_uri, dpb_service_instance,
                                     storage_service)
  dml_script_uri = uri_map['dml_script']
  data_folder_uri = uri_map['data']
  stats = dpb_service_instance.SubmitJob(
      pyspark_file=dml_script_uri,
      job_type=BaseDpbService.PYSPARK_JOB_TYPE,
      job_arguments=[data_folder_uri])
  logging.info(stats)
  if not stats['success']:
    logging.warning('Table creation failed')
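# ManageLifecycleResources is referenced above but not shown in this excerpt.
# A minimal sketch of the return shape Prepare relies on: a dict mapping each
# lifecycle step ('dml_script', 'data') to an object-store URI. The bucket
# naming mirrors the '{base_folder}_{lifecycle_step}' scheme that Cleanup
# deletes; the step names, upload behavior, and helper body here are
# assumptions, not the actual implementation.
def _ExampleManageLifecycleResources(base_folder, dpb_service_instance,
                                     storage_service):
  """Creates one bucket per lifecycle step and returns their URIs (sketch)."""
  del storage_service  # The real helper presumably also uploads artifacts.
  uri_map = {}
  for lifecycle_step in ('dml_script', 'data'):
    bucket = '{}_{}'.format(base_folder, lifecycle_step)
    dpb_service_instance.CreateBucket(bucket)
    uri_map[lifecycle_step] = '{}{}'.format(
        dpb_service_instance.PERSISTENT_FS_PREFIX, bucket)
  return uri_map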