def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from the BigQuery table, groups a collection of
  variants within a contiguous region of the genome (the size of the
  collection is adjustable through flag `--number_of_bases_per_shard`),
  sorts them, and then writes each group to one VCF file. All VCF data files
  are saved in `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the call names to
  `header_file_path`.
  """
  schema = _get_schema(known_args.input_table)
  # TODO(allieychen): Modify the SQL query with the specified call_names.
  query = _get_bigquery_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', query)
  bq_source = bigquery.BigQuerySource(query=query,
                                      validate=True,
                                      use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))
    if known_args.call_names:
      call_names = (p
                    | transforms.Create(known_args.call_names)
                    | beam.combiners.ToList())
    else:
      call_names = (
          variants
          | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
              known_args.preserve_call_names_order))
    _ = (call_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_call_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key, known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines())
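
# A minimal sketch of how `_pair_variant_with_key` (used above) could map a
# variant to a genomic shard key. It assumes the variant exposes
# `reference_name` and `start` attributes; the helper in the actual module
# may differ in detail.
def _pair_variant_with_key(variant, number_of_bases_per_shard):
  # Variants whose start positions fall inside the same window of
  # `number_of_bases_per_shard` bases get the same key, so the following
  # GroupByKey collects one shard (one output VCF data file) per key. The
  # zero-padded position keeps keys lexicographically sortable.
  return ('%s_%011d' % (variant.reference_name,
                        variant.start // number_of_bases_per_shard *
                        number_of_bases_per_shard),
          variant)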
def run_pipeline(pipeline_args, known_args):
  """A beam pipeline to resize and pad images from urls and save to storage.

  Args:
    pipeline_args: Arguments consumed by the beam pipeline.
    known_args: Extra args used to set various fields, such as the dataset
      and table from which to read cat urls and labels, and the bucket and
      image directory to write processed images.

  Returns:
    None. The pipeline writes the processed images to the image directory as
    a side effect.
  """
  # Specify pipeline options.
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Determine the BigQuery source from the dataset and table arguments.
  query = ('SELECT ROW_NUMBER() OVER() as index, original_url, label, randnum'
           ' from [' + known_args.dataset + '.' + known_args.table + ']')
  bq_source = bigquery.BigQuerySource(query=query)

  logging.info('Starting image collection into directory %s',
               known_args.output_dir)

  # Create the destination directory if it doesn't exist.
  output_dir = known_args.output_dir
  if known_args.cloud:
    output_dir = 'gs://' + known_args.storage_bucket + '/' + output_dir
  # The directory needs to be explicitly made on some filesystems.
  if not FileSystems.exists(output_dir):
    FileSystems.mkdirs(output_dir)

  # Run the pipeline.
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (p
         | 'read_rows_from_cat_info_table' >> beam.io.Read(bq_source)
         | 'fetch_images_from_urls' >> beam.Map(fetch_image_from_url)
         | 'filter_bad_or_absent_images' >> beam.Filter(
             filter_bad_or_missing_image)
         | 'resize_and_pad_images' >> beam.Map(
             resize_and_pad, output_image_dim=known_args.output_image_dim)
         | 'write_images_to_storage' >> beam.Map(
             write_processed_image, output_dir=output_dir))

  logging.info('Done collecting images')
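
# A hypothetical entry point showing how the two argument lists passed to
# `run_pipeline` might be produced: argparse keeps the flags it recognizes
# and hands everything else to Beam untouched. The flag names mirror the
# `known_args` attributes used above but are assumptions about the module's
# actual CLI.
import argparse
import logging

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('--dataset', required=True)
  parser.add_argument('--table', required=True)
  parser.add_argument('--output_dir', required=True)
  parser.add_argument('--output_image_dim', type=int, default=128)
  parser.add_argument('--storage_bucket', default=None)
  parser.add_argument('--cloud', action='store_true')
  # Unrecognized flags (e.g. --runner, --project) flow through to Beam.
  known_args, pipeline_args = parser.parse_known_args()
  run_pipeline(pipeline_args, known_args)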
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    vcf_data_header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from the BigQuery table, groups a collection of
  variants within a contiguous region of the genome (the size of the
  collection is adjustable through flag `--number_of_bases_per_shard`),
  sorts them, and then writes each group to one VCF file. All VCF data files
  are saved in `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.
  """
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
          bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                | bigquery_to_variant.BigQueryToVariant())
    call_names = (
        variants
        | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
    _ = (call_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_data_header,
             _VCF_FIXED_COLUMNS,
             vcf_data_header_file_path))
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key, known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines())
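
# A minimal sketch of what `_write_vcf_data_header` might do, assuming the
# ParDo element is the combined list of call names: it writes the single
# tab-separated `#CHROM ...` header line to the given path. The real helper
# may also emit meta-information lines.
from apache_beam.io import filesystems

def _write_vcf_data_header(call_names, fixed_columns, file_path):
  # type: (List[str], List[str], str) -> None
  # One column per fixed VCF header field, then one per call name.
  with filesystems.FileSystems.create(file_path) as file_to_write:
    file_to_write.write(
        '\t'.join(fixed_columns + call_names).encode('utf-8'))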
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
          bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)
  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as p:
    _ = (p
         | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
         | bigquery_to_variant.BigQueryToVariant()
         | densify_variants.DensifyVariants()
         | vcfio.WriteToVcf(known_args.output_file))
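
# A hypothetical entry point for the pipeline above, with an example
# invocation using placeholder table and output values; the actual flag set
# is defined by `_COMMAND_LINE_OPTIONS`.
#
#   python bigquery_to_vcf.py \
#       --input_table my-project:my_dataset.my_table \
#       --output_file gs://my-bucket/variants.vcf
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()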
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from the BigQuery table, groups a collection of
  variants within a contiguous region of the genome (the size of the
  collection is adjustable through flag `--number_of_bases_per_shard`),
  sorts them, and then writes each group to one VCF file. All VCF data files
  are saved in `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `header_file_path`.
  """
  schema = _get_schema(known_args.input_table)
  variant_query = _get_variant_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', variant_query)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                              validate=True,
                                              use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)
  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                  SAMPLE_INFO_TABLE_SUFFIX))
  bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                             validate=True,
                                             use_standard_sql=True)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))

    sample_table_rows = (
        p
        | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
    if known_args.sample_names:
      temp_sample_names = (p
                           | transforms.Create(known_args.sample_names,
                                               reshuffle=False))
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name_hash_table = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      temp_sample_ids = (
          variants
          | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      temp_sample_names = (
          temp_sample_ids
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name_hash_table))
          | 'CombineToList' >> beam.combiners.ToList()
          | 'SortSampleNames' >> beam.ParDo(sorted))

    name_to_id_hash_table = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    sample_ids = (temp_sample_names
                  | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                      beam.pvalue.AsSingleton(name_to_id_hash_table))
                  | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()

    _ = (sample_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_sample_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key, known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
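
# A minimal sketch of `_get_file_path_and_sorted_variants` as used after
# `GroupByKey` above: each element is a (shard key, variants) pair, and the
# output pairs one file path under `vcf_data_temp_folder` with that shard's
# coordinate-sorted variants for `WriteVcfDataLines`. The exact sort key and
# signature in the actual module may differ.
from apache_beam.io import filesystems

def _get_file_path_and_sorted_variants(shard, file_path_prefix):
  # type: (Tuple[str, Iterable[Variant]], str) -> Iterator[Tuple[str, List[Variant]]]
  shard_key, variants = shard
  file_path = filesystems.FileSystems.join(file_path_prefix, shard_key)
  # Sort within the shard so each output VCF data file is ordered by start
  # position.
  yield file_path, sorted(variants, key=lambda variant: variant.start)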