def _get_input_dimensions(known_args, pipeline_args):
  # type: (argparse.Namespace, List[str]) -> None
  """Estimates the input dimensions and records them on `known_args`.

  Runs a small auxiliary Beam pipeline over `known_args.all_patterns` that
  computes five estimates (variant count, sample count, value count, total
  file size, file count). Because pipeline results are not directly
  returnable to the driver, the estimates are written to a temp file by a
  ParDo and read back here after the pipeline completes.

  Side effects:
    - Mutates `google_cloud_options.job_name` (suffixes/sets a unique
      estimation job name).
    - Sets `known_args.estimated_variant_count`, `estimated_sample_count`,
      `estimated_value_count`, `files_size`, and `file_count`.

  Raises:
    ValueError: If the temp file does not contain exactly 5 lines.
  """
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  # Give the estimation run a unique job name, appended to any user-supplied
  # name so the two jobs are distinguishable in the console.
  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name

  # Side-channel file used to pass the estimates from the pipeline workers
  # back to this driver process. Falls back to a local temp dir when no
  # GCS temp_location is configured (e.g. DirectRunner).
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (
        estimates
        | 'GetEstimatedVariantCount' >>
        extract_input_size.GetEstimatedVariantCount())
    # Fan all five estimates into one ParDo (via AsSingleton side inputs)
    # that writes them to the temp file, one value per line. The argument
    # order here must match the line order read back below.
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  # The `with` block above waits for the pipeline to finish, so the file is
  # complete by the time we read it.
  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  # Line order is fixed by print_estimates_to_file's parameter order.
  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip())
def test_get_file_sizes(self):
  """Checks that GetFilesSize produces the combined size of all estimates."""
  estimates = self._create_vcf_estimates()
  test_pipeline = TestPipeline()
  total_size = (
      test_pipeline
      | transforms.Create(estimates)
      | 'GetFilesSize' >> extract_input_size.GetFilesSize())
  # The fixture's estimates are expected to sum to 600.
  assert_that(total_size, equal_to([600]))
  test_pipeline.run()