def main(argv):
    """Validate the command line, then build a stats report for one VCF.

    Reads the VCF named by FLAGS.input_vcf and hands its variants to
    vcf_stats.create_vcf_report, writing to FLAGS.outfile_base.

    Args:
      argv: Command-line arguments. Anything after the first element is
        treated as an unexpected positional argument and rejected.

    Raises:
      ValueError: If the VCF header does not list exactly one sample.
    """
    with errors.clean_commandline_error_exit():
        positional_args = argv[1:]
        if positional_args:
            message = (
                'Command line parsing failure: vcf_stats_report does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".').format(str(positional_args))
            errors.log_and_raise(message, errors.CommandLineError)

    input_path = FLAGS.input_vcf
    with vcf.VcfReader(input_path) as reader:
        header = reader.header

        names = header.sample_names
        if len(names) != 1:
            raise ValueError(
                'There must be exactly one sample in VCF: {}'.format(input_path))
        sample_name = names[0]

        # A missing GT field would only fail later, while records are being
        # read, so report it up front with a clearer message.
        has_gt = any(fmt.id == 'GT' for fmt in header.formats)
        if not has_gt:
            errors.log_and_raise('ERROR: No GT sub-column in VCF.')

        # FLAGS.num_records == -1 means "use every record"; any other value
        # caps how many records are taken from the reader.
        variants = reader.iterate()
        if FLAGS.num_records != -1:
            variants = itertools.islice(variants, FLAGS.num_records)

        vcf_stats.create_vcf_report(
            variants,
            output_basename=FLAGS.outfile_base,
            sample_name=sample_name,
            vcf_reader=reader)
def test_create_vcf_report(self):
    """Checks that the HTML report file exists after create_vcf_report runs.

    The report is generated from the golden postprocess output VCF into a
    scratch directory, and the test asserts that
    `<output_basename>.visual_report.html` was created there.
    """
    sample_name = 'test_sample_name'
    # TemporaryDirectory removes the scratch directory when the block exits,
    # even if the test fails; a bare mkdtemp() would leave it on disk.
    with tempfile.TemporaryDirectory() as base_dir:
        outfile_base = os.path.join(base_dir, 'stats_test')
        with vcf.VcfReader(testdata.GOLDEN_POSTPROCESS_OUTPUT) as reader:
            vcf_stats.create_vcf_report(
                variants=reader.iterate(),
                output_basename=outfile_base,
                sample_name=sample_name,
                vcf_reader=reader)
        # The existence check has to happen before the directory is cleaned up.
        self.assertTrue(
            tf.io.gfile.exists(outfile_base + '.visual_report.html'))
def main(argv=()):
    """Convert call_variants output into a VCF and, optionally, a gVCF.

    Steps, all driven by FLAGS:
      1. Reject positional arguments and inconsistent gVCF flags.
      2. Read one CallVariantsOutput record from FLAGS.infile to find the
         sample name; if there is none, an empty VCF is written.
      3. Otherwise sort the records into a temporary file and transform them
         into variants.
      4. Write FLAGS.outfile (and FLAGS.gvcf_outfile when nonvariant sites are
         supplied), building an index for any output ending in '.gz'.
      5. If FLAGS.vcf_stats_report is set, create a stats report from the
         written VCF.

    The temporary file holding the sorted records is closed in a `finally`
    clause, so it is released on error paths as well as on success.

    Args:
      argv: Command-line arguments; more than one element is an error.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # Either both gVCF flags are set or neither is.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(
            FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = sharded_file_utils.maybe_generate_sharded_filenames(FLAGS.infile)

        # Read one CallVariantsOutput record and extract the sample name from it.
        # Note that this assumes that all CallVariantsOutput protos in the infile
        # contain a single VariantCall within their constituent Variant proto, and
        # that the call_set_name is identical in each of the records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)

        # Only created when there are records to sort. Tracking the handle
        # itself (rather than re-testing `record`) keeps the cleanup condition
        # identical to the creation condition.
        temp = None
        try:
            if record is None:
                logging.info(
                    'call_variants_output is empty. Writing out empty VCF.')
                sample_name = dv_constants.DEFAULT_SAMPLE_NAME
                if FLAGS.sample_name:
                    logging.info(
                        '--sample_name is set in postprocess_variant. Using %s as the '
                        'sample name.', FLAGS.sample_name)
                    sample_name = FLAGS.sample_name
                variant_generator = iter([])
            else:
                sample_name = _extract_single_sample_name(record)
                temp = tempfile.NamedTemporaryFile()
                start_time = time.time()
                postprocess_variants_lib.process_single_sites_tfrecords(
                    contigs, paths, temp.name)
                logging.info('CVO sorting took %s minutes',
                             (time.time() - start_time) / 60)

                logging.info('Transforming call_variants_output to variants.')
                independent_variants = _transform_call_variants_output_to_variants(
                    input_sorted_tfrecord_path=temp.name,
                    qual_filter=FLAGS.qual_filter,
                    multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                    sample_name=sample_name,
                    group_variants=FLAGS.group_variants,
                    use_multiallelic_model=FLAGS.use_multiallelic_model)
                variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                    independent_variants)

            header = dv_vcf_constants.deepvariant_header(
                contigs=contigs, sample_names=[sample_name])
            use_csi = _decide_to_use_csi(contigs)

            start_time = time.time()
            if not FLAGS.nonvariant_site_tfrecord_path:
                logging.info('Writing variants to VCF.')
                write_variants_to_vcf(
                    variant_iterable=variant_generator,
                    output_vcf_path=FLAGS.outfile,
                    header=header)
                if FLAGS.outfile.endswith('.gz'):
                    build_index(FLAGS.outfile, use_csi)
                logging.info('VCF creation took %s minutes',
                             (time.time() - start_time) / 60)
            else:
                logging.info('Merging and writing variants to VCF and gVCF.')
                lessthanfn = _get_contig_based_lessthan(contigs)
                with vcf.VcfWriter(
                    FLAGS.outfile, header=header, round_qualities=True) as vcf_writer, \
                    vcf.VcfWriter(
                        FLAGS.gvcf_outfile, header=header, round_qualities=True) \
                    as gvcf_writer:
                    nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
                        FLAGS.nonvariant_site_tfrecord_path,
                        key=_get_contig_based_variant_sort_keyfn(contigs),
                        proto=variants_pb2.Variant)
                    merge_and_write_variants_and_nonvariants(
                        variant_generator, nonvariant_generator, lessthanfn,
                        fasta_reader, vcf_writer, gvcf_writer)
                # Index only after both writers have been closed.
                if FLAGS.outfile.endswith('.gz'):
                    build_index(FLAGS.outfile, use_csi)
                if FLAGS.gvcf_outfile.endswith('.gz'):
                    build_index(FLAGS.gvcf_outfile, use_csi)
                logging.info('Finished writing VCF and gVCF in %s minutes.',
                             (time.time() - start_time) / 60)

            if FLAGS.vcf_stats_report:
                outfile_base = _get_base_path(FLAGS.outfile)
                with vcf.VcfReader(FLAGS.outfile) as reader:
                    vcf_stats.create_vcf_report(
                        variants=reader.iterate(),
                        output_basename=outfile_base,
                        sample_name=sample_name,
                        vcf_reader=reader)
        finally:
            # Close the sorted-records temp file even if a step above raised.
            if temp is not None:
                temp.close()