def test_deepvariant_header(self, contigs, sample_names):
    """Checks the header built by dv_vcf_constants.deepvariant_header.

    Args:
      contigs: Contigs handed to deepvariant_header; the resulting header is
        expected to expose the same elements.
      sample_names: Sample names handed to deepvariant_header; the resulting
        header is expected to expose the same elements.
    """
    vcf_header = dv_vcf_constants.deepvariant_header(
        contigs=contigs, sample_names=sample_names)

    # The inputs must come back on the header (element order is not checked).
    self.assertCountEqual(vcf_header.contigs, contigs)
    self.assertCountEqual(vcf_header.sample_names, sample_names)

    # Every metadata section must hold at least one entry.
    for section in ('filters', 'infos', 'formats'):
        self.assertGreater(len(getattr(vcf_header, section)), 0)
def test_deepvariant_header(self, contigs, sample_names):
    """Verifies deepvariant_header preserves its inputs and is non-empty.

    Args:
      contigs: Contigs passed to deepvariant_header and compared against
        the contigs of the returned header.
      sample_names: Sample names passed to deepvariant_header and compared
        against the sample names of the returned header.
    """
    built = dv_vcf_constants.deepvariant_header(
        contigs=contigs, sample_names=sample_names)

    # Same elements, in any order.
    self.assertCountEqual(built.contigs, contigs)
    self.assertCountEqual(built.sample_names, sample_names)

    # FILTER, INFO and FORMAT collections must each be populated.
    for attribute in ('filters', 'infos', 'formats'):
        entries = getattr(built, attribute)
        self.assertGreater(len(entries), 0)
def main(argv):
    """Writes the variants derived from FLAGS.examples to FLAGS.output_vcf.

    The contigs for the VCF header are taken from the reference given by
    FLAGS.ref. The sample name comes from FLAGS.sample_name when it is set;
    otherwise it is obtained from peek_sample_name. Every written variant
    has the call_set_name of its first call overwritten with that name.

    Args:
      argv: Ignored.
    """
    del argv

    contigs = fasta.RefFastaReader(FLAGS.ref).header.contigs

    # A negative --max_records is translated to None before being passed on.
    record_limit = None
    if FLAGS.max_records >= 0:
        record_limit = FLAGS.max_records
    variants = examples_to_variants(FLAGS.examples, max_records=record_limit)

    if FLAGS.sample_name:
        sample_name = FLAGS.sample_name
    else:
        # peek_sample_name hands back the iterator to keep consuming.
        sample_name, variants = peek_sample_name(variants)

    vcf_header = dv_vcf_constants.deepvariant_header(
        contigs=contigs, sample_names=[sample_name])

    with vcf.VcfWriter(FLAGS.output_vcf, header=vcf_header) as vcf_out:
        for current in variants:
            current.calls[0].call_set_name = sample_name
            logging.log_every_n(
                logging.INFO,
                'Converted %s',
                FLAGS.log_every,
                variant_utils.variant_key(current))
            vcf_out.write(current)
def main(argv=()):
    """Turns call_variants output into a VCF and, optionally, a gVCF.

    Records are read from FLAGS.infile, sorted into a temporary file,
    transformed into variants and written to FLAGS.outfile. When
    FLAGS.nonvariant_site_tfrecord_path is set, the freshly written VCF is
    read back, merged with the non-variant records and written to
    FLAGS.gvcf_outfile.

    Args:
      argv: Command-line arguments. More than one element is reported as an
        errors.CommandLineError since positional arguments are not accepted.

    Raises:
      ValueError: if no record can be read from the input paths.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            positional_args_message = (
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)))
            errors.log_and_raise(
                positional_args_message, errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be supplied together or not at all.
        has_nonvariant_input = bool(FLAGS.nonvariant_site_tfrecord_path)
        has_gvcf_output = bool(FLAGS.gvcf_outfile)
        if has_nonvariant_input != has_gvcf_output:
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()
        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(
            FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        joined_paths = ','.join(paths)

        # Read one CallVariantsOutput record and extract the sample name from
        # it. Note that this assumes that all CallVariantsOutput protos in the
        # infile contain a single VariantCall within their constituent Variant
        # proto, and that the call_set_name is identical in each of the
        # records.
        first_record = tf_utils.get_one_example_from_examples_path(
            joined_paths, proto=deepvariant_pb2.CallVariantsOutput)
        if first_record is None:
            raise ValueError(
                'Cannot find any records in {}'.format(joined_paths))

        sample_name = _extract_single_sample_name(first_record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])

        # The VCF must be written while the temporary sorted file still
        # exists, because the variants are produced from it.
        with tempfile.NamedTemporaryFile() as sorted_file:
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, sorted_file.name)
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=sorted_file.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            resolved_variants = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)
            write_variants_to_vcf(
                variant_generator=resolved_variants,
                output_vcf_path=FLAGS.outfile,
                header=header)

        # Also write out the gVCF file if it was provided.
        if not FLAGS.nonvariant_site_tfrecord_path:
            return

        nonvariant_generator = io_utils.read_shard_sorted_tfrecords(
            FLAGS.nonvariant_site_tfrecord_path,
            key=_get_contig_based_variant_sort_keyfn(contigs),
            proto=variants_pb2.Variant)
        with vcf.VcfReader(FLAGS.outfile) as variant_reader:
            lessthanfn = _get_contig_based_lessthan(contigs)
            gvcf_variants = (
                _transform_to_gvcf_record(written_variant)
                for written_variant in variant_reader.iterate())
            merged_variants = merge_variants_and_nonvariants(
                gvcf_variants, nonvariant_generator, lessthanfn, fasta_reader)
            write_variants_to_vcf(
                variant_generator=merged_variants,
                output_vcf_path=FLAGS.gvcf_outfile,
                header=header)
def main(argv=()):
    """Turns call_variants output into a VCF and, optionally, a gVCF.

    Records are read from FLAGS.infile. When no record is found an empty VCF
    is written, using FLAGS.sample_name if set and
    dv_constants.DEFAULT_SAMPLE_NAME otherwise. When records exist they are
    sorted into a temporary file, transformed into variants and written to
    FLAGS.outfile; if FLAGS.nonvariant_site_tfrecord_path is set, the VCF and
    the gVCF (FLAGS.gvcf_outfile) are written together. Outputs ending in
    '.gz' are indexed, and a stats report is produced when
    FLAGS.vcf_stats_report is set.

    The temporary file is released in a `finally` clause so that it is closed
    even when writing, indexing or report generation raises.

    Args:
      argv: Command-line arguments. More than one element is reported as an
        errors.CommandLineError since positional arguments are not accepted.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be supplied together or not at all.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()
        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(
            FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = sharded_file_utils.maybe_generate_sharded_filenames(
            FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from
        # it. Note that this assumes that all CallVariantsOutput protos in the
        # infile contain a single VariantCall within their constituent Variant
        # proto, and that the call_set_name is identical in each of the
        # records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)

        # Only created when there are records to sort. It has to outlive the
        # writing below because the variants are produced from it.
        temp = None
        try:
            if record is None:
                logging.info(
                    'call_variants_output is empty. Writing out empty VCF.')
                sample_name = dv_constants.DEFAULT_SAMPLE_NAME
                if FLAGS.sample_name:
                    logging.info(
                        '--sample_name is set in postprocess_variant. Using %s as the '
                        'sample name.', FLAGS.sample_name)
                    sample_name = FLAGS.sample_name
                variant_generator = iter([])
            else:
                sample_name = _extract_single_sample_name(record)
                temp = tempfile.NamedTemporaryFile()
                start_time = time.time()
                postprocess_variants_lib.process_single_sites_tfrecords(
                    contigs, paths, temp.name)
                logging.info('CVO sorting took %s minutes',
                             (time.time() - start_time) / 60)

                logging.info('Transforming call_variants_output to variants.')
                independent_variants = (
                    _transform_call_variants_output_to_variants(
                        input_sorted_tfrecord_path=temp.name,
                        qual_filter=FLAGS.qual_filter,
                        multi_allelic_qual_filter=(
                            FLAGS.multi_allelic_qual_filter),
                        sample_name=sample_name,
                        group_variants=FLAGS.group_variants,
                        use_multiallelic_model=FLAGS.use_multiallelic_model))
                variant_generator = (
                    haplotypes.maybe_resolve_conflicting_variants(
                        independent_variants))

            header = dv_vcf_constants.deepvariant_header(
                contigs=contigs, sample_names=[sample_name])
            use_csi = _decide_to_use_csi(contigs)

            start_time = time.time()
            if not FLAGS.nonvariant_site_tfrecord_path:
                logging.info('Writing variants to VCF.')
                write_variants_to_vcf(
                    variant_iterable=variant_generator,
                    output_vcf_path=FLAGS.outfile,
                    header=header)
                if FLAGS.outfile.endswith('.gz'):
                    build_index(FLAGS.outfile, use_csi)
                logging.info('VCF creation took %s minutes',
                             (time.time() - start_time) / 60)
            else:
                logging.info('Merging and writing variants to VCF and gVCF.')
                lessthanfn = _get_contig_based_lessthan(contigs)
                with vcf.VcfWriter(
                        FLAGS.outfile, header=header,
                        round_qualities=True) as vcf_writer, \
                        vcf.VcfWriter(
                            FLAGS.gvcf_outfile, header=header,
                            round_qualities=True) as gvcf_writer:
                    nonvariant_generator = (
                        tfrecord.read_shard_sorted_tfrecords(
                            FLAGS.nonvariant_site_tfrecord_path,
                            key=_get_contig_based_variant_sort_keyfn(contigs),
                            proto=variants_pb2.Variant))
                    merge_and_write_variants_and_nonvariants(
                        variant_generator, nonvariant_generator, lessthanfn,
                        fasta_reader, vcf_writer, gvcf_writer)
                # Index only after both writers have been closed.
                if FLAGS.outfile.endswith('.gz'):
                    build_index(FLAGS.outfile, use_csi)
                if FLAGS.gvcf_outfile.endswith('.gz'):
                    build_index(FLAGS.gvcf_outfile, use_csi)
                logging.info('Finished writing VCF and gVCF in %s minutes.',
                             (time.time() - start_time) / 60)

            if FLAGS.vcf_stats_report:
                outfile_base = _get_base_path(FLAGS.outfile)
                with vcf.VcfReader(FLAGS.outfile) as reader:
                    vcf_stats.create_vcf_report(
                        variants=reader.iterate(),
                        output_basename=outfile_base,
                        sample_name=sample_name,
                        vcf_reader=reader)
        finally:
            # Guard on the file itself rather than on the truthiness of the
            # record, and release it on failure paths too.
            if temp is not None:
                temp.close()
def main(argv=()):
    """Turns call_variants output into a VCF and, optionally, a gVCF.

    Records are read from FLAGS.infile, sorted into a temporary file,
    transformed into variants and written to FLAGS.outfile. When
    FLAGS.nonvariant_site_tfrecord_path is set, the freshly written VCF is
    read back (without an index), merged with the non-variant records and
    written to FLAGS.gvcf_outfile.

    Args:
      argv: Command-line arguments. More than one element is reported as an
        errors.CommandLineError since positional arguments are not accepted.

    Raises:
      ValueError: if the first input path yields no record, so no sample
        name can be determined.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be supplied together or not at all.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()
        logging_level.set_from_flag()

        fasta_reader = fasta.RefFastaReader(
            FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from
        # it. Note that this assumes that all CallVariantsOutput protos in the
        # infile contain a single VariantCall within their constituent Variant
        # proto, and that the call_set_name is identical in each of the
        # records.
        # The None default turns an empty input into a descriptive error
        # instead of a bare StopIteration escaping from next().
        record = next(
            io_utils.read_tfrecords(
                paths[0],
                proto=deepvariant_pb2.CallVariantsOutput,
                max_records=1),
            None)
        if record is None:
            raise ValueError(
                'Cannot find any records in {}'.format(paths[0]))

        sample_name = _extract_single_sample_name(record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])

        # The VCF must be written while the temporary sorted file still
        # exists, because the variants are produced from it.
        with tempfile.NamedTemporaryFile() as temp:
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)
            write_variants_to_vcf(
                variant_generator=variant_generator,
                output_vcf_path=FLAGS.outfile,
                header=header)

        # Also write out the gVCF file if it was provided.
        if FLAGS.nonvariant_site_tfrecord_path:
            nonvariant_generator = io_utils.read_shard_sorted_tfrecords(
                FLAGS.nonvariant_site_tfrecord_path,
                key=_get_contig_based_variant_sort_keyfn(contigs),
                proto=variants_pb2.Variant)
            with vcf.VcfReader(
                    FLAGS.outfile, use_index=False) as variant_reader:
                lessthanfn = _get_contig_based_lessthan(contigs)
                gvcf_variants = (
                    _transform_to_gvcf_record(variant)
                    for variant in variant_reader.iterate())
                merged_variants = merge_variants_and_nonvariants(
                    gvcf_variants, nonvariant_generator, lessthanfn,
                    fasta_reader)
                write_variants_to_vcf(
                    variant_generator=merged_variants,
                    output_vcf_path=FLAGS.gvcf_outfile,
                    header=header)