def test_sample_to_example(self):
  expected = """features {
  feature {
    key: "gender"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "gender_string"
    value {
      bytes_list {
        value: "female"
      }
    }
  }
  feature {
    key: "population"
    value {
      int64_list {
        value: -1
      }
    }
  }
  feature {
    key: "population_string"
    value {
      bytes_list {
        value: "some pop not in the training labels"
      }
    }
  }
  feature {
    key: "sample_name"
    value {
      bytes_list {
        value: "sample1"
      }
    }
  }
  feature {
    key: "super_population"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "super_population_string"
    value {
      bytes_list {
        value: "SAS"
      }
    }
  }
  feature {
    key: "variants_9"
    value {
      int64_list {
        value: -5153783975271321865
      }
    }
  }
}
"""
  variants_to_features_fn = variant_encoder.build_variants_to_features(
      variant_to_feature_name_fn=variant_encoder.variant_to_contig_feature_name,
      variant_to_words_fn=variant_encoder.build_variant_to_words(
          add_hethom=False))
  sample_to_example = encoder.build_sample_to_example(
      metadata_to_features_fn=metadata_encoder.metadata_to_ancestry_features,
      variants_to_features_fn=variants_to_features_fn)
  self.assertEqual(
      expected,
      str(
          sample_to_example(SAMPLE_ID, [HETEROZYGOUS_VARIANT_CALL],
                            SAMPLE_METADATA)))
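# A minimal sketch of the fixtures the test above assumes. The actual
# SAMPLE_ID, SAMPLE_METADATA, and HETEROZYGOUS_VARIANT_CALL constants are
# defined elsewhere in this test module; the field names and values below are
# illustrative assumptions chosen to be consistent with the expected proto
# text above, not the real test data.
SAMPLE_ID = 'sample1'
SAMPLE_METADATA = {
    'sample1': {
        'sample_name': 'sample1',
        'gender': 'female',
        'population': 'some pop not in the training labels',
        'super_population': 'SAS',
    }
}
HETEROZYGOUS_VARIANT_CALL = {
    'reference_name': '9',  # Yields the contig feature name "variants_9".
    'start': 3500000,
    'end': 3500001,
    'reference_bases': 'T',
    'alternate_bases': ['C'],
    'call': [{'call_set_name': 'sample1', 'genotype': [0, 1]}],
}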
def run(argv=None):
  """Runs the variant preprocess pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  preprocess_options = pipeline_options.view_as(PreprocessOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(preprocess_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'preprocess-variant-features-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(preprocess_options.metadata,
                    'r').read()).render(METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  data_query = str(
      Template(open(preprocess_options.input,
                    'r').read()).render(DATA_QUERY_REPLACEMENTS))
  logging.info('data query : %s', data_query)

  # Assemble the strategies to be used to convert the raw data to features.
  variant_to_feature_name_fn = variant_encoder.variant_to_contig_feature_name
  if preprocess_options.bin_size is not None:
    variant_to_feature_name_fn = variant_encoder.build_variant_to_binned_feature_name(
        bin_size=preprocess_options.bin_size)
  variants_to_features_fn = variant_encoder.build_variants_to_features(
      variant_to_feature_name_fn=variant_to_feature_name_fn,
      variant_to_words_fn=variant_encoder.build_variant_to_words(
          add_hethom=preprocess_options.add_hethom))
  sample_to_example_fn = encoder.build_sample_to_example(
      metadata_to_features_fn=metadata_encoder.metadata_to_ancestry_features,
      variants_to_features_fn=variants_to_features_fn)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query,
                                   use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the table rows into a PCollection.
    rows = p | 'ReadVariants' >> beam.io.Read(
        beam.io.BigQuerySource(query=data_query, use_standard_sql=True))

    # Convert the data into TensorFlow Example Protocol Buffers.
    examples = variants_to_examples(
        rows, samples_metadata, sample_to_example_fn=sample_to_example_fn)

    # Write the serialized compressed protocol buffers to Cloud Storage.
    _ = (examples
         | 'EncodeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz'))
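# A minimal entry-point sketch, assuming this module is run as a script; the
# original file may wire this up differently. The example flags mirror the
# PreprocessOptions fields referenced above (--input, --metadata, --output,
# and optionally --bin_size and --add_hethom) plus standard Beam options;
# all values shown are placeholders:
#
#   python preprocess_data.py \
#     --project my-project-id \
#     --input gs://my-bucket/data_query.sql \
#     --metadata gs://my-bucket/metadata_query.sql \
#     --output gs://my-bucket/preprocess \
#     --runner DataflowRunner
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()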