Exemplo n.º 1
0
  def test_vcf_query(self):
    # Index the output VCF before opening it; presumably the region query
    # below needs that index -- TODO confirm against VcfReader.query.
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    # Querying the same region through both readers must give equal results.
    region = ranges.parse_literal('chr3:100,000-500,000')
    records_from_input = list(self.input_reader.query(region))
    records_from_output = list(self.output_reader.query(region))
    self.assertEqual(records_from_input, records_from_output)
Exemplo n.º 2
0
def build_index(vcf_file, csi=False):
  """Indexes a VCF file by delegating to the `tabix` module.

  Args:
    vcf_file: string. Path of the VCF file that should be indexed.
    csi: bool. When truthy, `tabix.build_csi_index` is called with
      `min_shift=14`; otherwise `tabix.build_index` is called.
  """
  if not csi:
    tabix.build_index(vcf_file)
    return

  tabix.build_csi_index(vcf_file, min_shift=14)
Exemplo n.º 3
0
 def test_build_index(self):
   # The index file must be absent before indexing ...
   exists_before = gfile.Exists(self.tbx_index_file)
   self.assertFalse(exists_before)

   tabix.build_index(self.output_file)

   # ... and present once build_index has run on the output file.
   exists_after = gfile.Exists(self.tbx_index_file)
   self.assertTrue(exists_after)
Exemplo n.º 4
0
def main(argv=()):
    """Converts call_variants output into a VCF and, optionally, a gVCF.

    The whole body runs inside `errors.clean_commandline_error_exit()`. In
    order, it:

    1. Rejects positional arguments and inconsistent gVCF flags.
    2. Reads the contigs from the reference FASTA given by `FLAGS.ref`.
    3. Reads one CallVariantsOutput record from `FLAGS.infile` to obtain the
       sample name and builds the VCF header from it.
    4. Runs `process_single_sites_tfrecords` into a temporary file, then
       turns that file into a variant iterable.
    5. Writes either only the VCF (`FLAGS.outfile`), or the VCF plus the gVCF
       (`FLAGS.gvcf_outfile`) merged with the non-variant sites from
       `FLAGS.nonvariant_site_tfrecord_path`.
    6. Builds a tabix index for every output whose path ends in '.gz'.

    Args:
        argv: Command-line arguments. More than one element is treated as
            unexpected positional arguments (the first element is presumably
            the program name -- confirm against the caller).

    Raises:
        ValueError: If no record can be read from `FLAGS.infile`. It is raised
            inside the `clean_commandline_error_exit` context; how that
            context surfaces it is defined elsewhere.

    Command-line problems are reported through `errors.log_and_raise` with
    `errors.CommandLineError`.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags are only valid together: the check fires when
        # exactly one of them is set.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(FLAGS.ref,
                                                cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = sharded_file_utils.maybe_generate_sharded_filenames(
            FLAGS.infile)
        # Joined once: used both for the lookup and for the error message.
        joined_paths = ','.join(paths)
        # Read one CallVariantsOutput record and extract the sample name from it.
        # Note that this assumes that all CallVariantsOutput protos in the infile
        # contain a single VariantCall within their constituent Variant proto, and
        # that the call_set_name is identical in each of the records.
        record = tf_utils.get_one_example_from_examples_path(
            joined_paths, proto=deepvariant_pb2.CallVariantsOutput)
        if record is None:
            raise ValueError('Cannot find any records in {}'.format(
                joined_paths))

        sample_name = _extract_single_sample_name(record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])
        # The temporary file receives the output of
        # process_single_sites_tfrecords and is read back below; it only
        # needs to live for the duration of this `with` block.
        with tempfile.NamedTemporaryFile() as temp:
            start_time = time.time()
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            logging.info('CVO sorting took %s minutes',
                         (time.time() - start_time) / 60)

            logging.info('Transforming call_variants_output to variants.')
            # No timer is started for this step: no elapsed time is ever
            # logged for it, and the timer is restarted before the write phase.
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)

            # Timer for the write phase; reported by whichever branch runs.
            start_time = time.time()
            if not FLAGS.nonvariant_site_tfrecord_path:
                # VCF-only mode.
                logging.info('Writing variants to VCF.')
                write_variants_to_vcf(variant_iterable=variant_generator,
                                      output_vcf_path=FLAGS.outfile,
                                      header=header)
                # Outputs with a '.gz' suffix additionally get a tabix index.
                if FLAGS.outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.outfile)
                logging.info('VCF creation took %s minutes',
                             (time.time() - start_time) / 60)
            else:
                # VCF + gVCF mode: variants are merged with the non-variant
                # site records and written through both writers.
                logging.info('Merging and writing variants to VCF and gVCF.')
                lessthanfn = _get_contig_based_lessthan(contigs)
                with vcf.VcfWriter(
                    FLAGS.outfile, header=header, round_qualities=True) as vcf_writer, \
                    vcf.VcfWriter(
                        FLAGS.gvcf_outfile, header=header, round_qualities=True) \
                    as gvcf_writer:
                    nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
                        FLAGS.nonvariant_site_tfrecord_path,
                        key=_get_contig_based_variant_sort_keyfn(contigs),
                        proto=variants_pb2.Variant)
                    merge_and_write_variants_and_nonvariants(
                        variant_generator, nonvariant_generator, lessthanfn,
                        fasta_reader, vcf_writer, gvcf_writer)
                # Indexing happens only after both writers have been exited.
                if FLAGS.outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.outfile)
                if FLAGS.gvcf_outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.gvcf_outfile)
                logging.info('Finished writing VCF and gVCF in %s minutes.',
                             (time.time() - start_time) / 60)