예제 #1
0
 def test_variants_overlap(self):
   """Checks that variants_overlap defers to ranges.ranges_overlap.

   ranges.ranges_overlap is patched to return a sentinel, so the test can
   verify that the sentinel comes back unchanged and that the patched function
   was called exactly once with the ranges of the two variants, in order.
   """
   first = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   second = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=20)
   with mock.patch.object(
       ranges, 'ranges_overlap', return_value='SENTINEL') as mock_overlap:
     result = variant_utils.variants_overlap(first, second)
     self.assertEqual(result, 'SENTINEL')
     # The expected arguments are built only after the call under test.
     expected_args = (variant_utils.variant_range(first),
                      variant_utils.variant_range(second))
     mock_overlap.assert_called_once_with(*expected_args)
예제 #2
0
 def test_variants_overlap(self):
   """Verifies variants_overlap forwards both variant ranges to ranges_overlap.

   With ranges.ranges_overlap replaced by a mock returning 'SENTINEL', the
   value returned by variants_overlap must be that sentinel and the mock must
   have been called once with (variant_range(left), variant_range(right)).
   """
   left = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   right = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=20)
   with mock.patch.object(ranges, 'ranges_overlap') as patched_overlap:
     patched_overlap.return_value = 'SENTINEL'
     observed = variant_utils.variants_overlap(left, right)
     self.assertEqual(observed, 'SENTINEL')
     left_range = variant_utils.variant_range(left)
     right_range = variant_utils.variant_range(right)
     patched_overlap.assert_called_once_with(left_range, right_range)
예제 #3
0
 def test_variant_position_and_range(self):
   """Checks variant_position, variant_range and variant_range_tuple.

   Two variants starting at 10 on chrom '1' are used: one whose first allele is
   a single base ('A') and one whose first allele is four bases ('AGCT'). Both
   are expected to share the position make_range('1', 10, 11), while their
   ranges are expected to end at 11 and 14 respectively.
   """
   short_variant = test_utils.make_variant(
       chrom='1', alleles=['A', 'C'], start=10)
   long_variant = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   expected_position = ranges.make_range('1', 10, 11)
   expected_long_range = ranges.make_range('1', 10, 14)
   expected_short_tuple = ('1', 10, 11)
   expected_long_tuple = ('1', 10, 14)

   # The position is the same for both variants.
   self.assertEqual(expected_position,
                    variant_utils.variant_position(short_variant))
   self.assertEqual(expected_position,
                    variant_utils.variant_position(long_variant))

   # For the single-base variant the range equals the position.
   self.assertEqual(expected_position,
                    variant_utils.variant_range(short_variant))
   self.assertEqual(expected_long_range,
                    variant_utils.variant_range(long_variant))

   # The tuple form carries the same coordinates as the range form.
   self.assertEqual(expected_short_tuple,
                    variant_utils.variant_range_tuple(short_variant))
   self.assertEqual(expected_long_tuple,
                    variant_utils.variant_range_tuple(long_variant))
예제 #4
0
 def test_variant_position_and_range(self):
   """Exercises the position/range helpers of variant_utils on two variants.

   `one_base` has alleles ['A', 'C'] and `four_base` has alleles
   ['AGCT', 'C']; both start at 10 on chrom '1'. The expectations are:
   variant_position gives make_range('1', 10, 11) for both, variant_range
   gives make_range('1', 10, 11) and make_range('1', 10, 14), and
   variant_range_tuple gives the matching plain tuples.
   """
   chrom, start = '1', 10
   one_base = test_utils.make_variant(
       chrom='1', alleles=['A', 'C'], start=10)
   four_base = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   position = ranges.make_range(chrom, start, 11)
   four_base_range = ranges.make_range(chrom, start, 14)

   self.assertEqual(position, variant_utils.variant_position(one_base))
   self.assertEqual(position, variant_utils.variant_position(four_base))
   self.assertEqual(position, variant_utils.variant_range(one_base))
   self.assertEqual(four_base_range, variant_utils.variant_range(four_base))
   self.assertEqual((chrom, start, 11),
                    variant_utils.variant_range_tuple(one_base))
   self.assertEqual((chrom, start, 14),
                    variant_utils.variant_range_tuple(four_base))
예제 #5
0
def _transform_call_variants_output_to_variants(
    input_sorted_tfrecord_path, qual_filter, multi_allelic_qual_filter,
    sample_name):
  """Converts sorted CallVariantsOutput records into Variant protos.

  Consecutive records whose variants share the same range are grouped, merged
  with `merge_predictions`, and turned into a single Variant through
  `add_call_to_variant`. Two filters take part in the conversion: variants
  whose quality is below `qual_filter` are filtered, and individual alleles of
  multi-allelic variants whose quality is below `multi_allelic_qual_filter` are
  omitted.

  Args:
    input_sorted_tfrecord_path: str. TFRecord format file containing sorted
      CallVariantsOutput protos.
    qual_filter: double. The qual value below which to filter variants.
    multi_allelic_qual_filter: double. The qual value below which to filter
      multi-allelic variants.
    sample_name: str. Sample name to write to VCF file.

  Yields:
    Variant protos, one per group of records, in the order of the input.
  """
  call_outputs = io_utils.read_tfrecords(
      input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)

  def _range_of(call_output):
    # Grouping key: the range of the variant carried by the record.
    return variant_utils.variant_range(call_output.variant)

  # groupby only merges adjacent records, hence the sorted-input requirement.
  for _, same_range_calls in itertools.groupby(call_outputs, _range_of):
    grouped_outputs = list(same_range_calls)
    canonical_variant, predictions = merge_predictions(
        grouped_outputs, multi_allelic_qual_filter)
    yield add_call_to_variant(
        canonical_variant,
        predictions,
        qual_filter=qual_filter,
        sample_name=sample_name)
예제 #6
0
def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
                                                qual_filter,
                                                multi_allelic_qual_filter,
                                                sample_name):
    """Converts sorted CallVariantsOutput records into Variant protos.

    Consecutive records whose variants share the same range are grouped,
    ordered with `_sort_grouped_variants`, merged with `merge_predictions`,
    and turned into a single Variant through `add_call_to_variant`. Two
    filters take part in the conversion: variants whose quality is below
    `qual_filter` are filtered, and individual alleles of multi-allelic
    variants whose quality is below `multi_allelic_qual_filter` are omitted.

    Args:
      input_sorted_tfrecord_path: str. TFRecord format file containing sorted
        CallVariantsOutput protos.
      qual_filter: double. The qual value below which to filter variants.
      multi_allelic_qual_filter: double. The qual value below which to filter
        multi-allelic variants.
      sample_name: str. Sample name to write to VCF file.

    Yields:
      Variant protos, one per group of records, in the order of the input.
    """
    records = io_utils.read_tfrecords(
        input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)

    def _range_key(call_output):
        # Grouping key: the range of the variant carried by the record.
        return variant_utils.variant_range(call_output.variant)

    # groupby only merges adjacent records, hence the sorted-input
    # requirement.
    for _, same_range_group in itertools.groupby(records, _range_key):
        sorted_calls = _sort_grouped_variants(same_range_group)
        canonical_variant, predictions = merge_predictions(
            sorted_calls, multi_allelic_qual_filter)
        yield add_call_to_variant(
            canonical_variant,
            predictions,
            qual_filter=qual_filter,
            sample_name=sample_name)
예제 #7
0
def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
                                                qual_filter,
                                                multi_allelic_qual_filter,
                                                sample_name, group_variants,
                                                use_multiallelic_model):
  """Converts sorted CallVariantsOutput records into Variant protos.

  Records are read from the TFRecord file, split into groups of adjacent
  records, ordered with `_sort_grouped_variants`, merged with
  `merge_predictions`, and turned into a single Variant per group through
  `add_call_to_variant`. Two filters take part in the conversion: variants
  whose quality is below `qual_filter` are filtered, and individual alleles of
  multi-allelic variants whose quality is below `multi_allelic_qual_filter` are
  omitted.

  Args:
    input_sorted_tfrecord_path: str. TFRecord format file containing sorted
      CallVariantsOutput protos.
    qual_filter: double. The qual value below which to filter variants.
    multi_allelic_qual_filter: double. The qual value below which to filter
      multi-allelic variants.
    sample_name: str. Sample name to write to VCF file.
    group_variants: bool. If true, group variants that have same start and end
      position.
    use_multiallelic_model: bool. If True, use a specialized model for genotype
      resolution of multiallelic cases with two alts.

  Yields:
    Variant protos, one per group of records, in the order of the input.
  """
  multiallelic_model = get_multiallelic_model(
      use_multiallelic_model=use_multiallelic_model)

  if group_variants:
    def group_key(call_output):
      # Grouping key: the range of the variant carried by the record.
      return variant_utils.variant_range(call_output.variant)
  else:
    # With a None key, itertools.groupby compares the records themselves, so
    # only adjacent records that compare equal end up in the same group.
    group_key = None

  call_outputs = tfrecord.read_tfrecords(
      input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)
  for _, grouped_calls in itertools.groupby(call_outputs, group_key):
    sorted_calls = _sort_grouped_variants(grouped_calls)
    canonical_variant, predictions = merge_predictions(
        sorted_calls,
        multi_allelic_qual_filter,
        multiallelic_model=multiallelic_model)
    yield add_call_to_variant(
        canonical_variant,
        predictions,
        qual_filter=qual_filter,
        sample_name=sample_name)
예제 #8
0
    def _label_grouped_variants(self, variants):
        """Yields a VariantLabel for the variants of one group.

        Truth variants are fetched for the span covering all of `variants`,
        expanded by _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP. If more truth
        variants come back than `self.max_group_size`, labeling is skipped:
        two warnings are logged and every input variant is yielded as not
        confident with genotype (-1, -1). Otherwise the variants are labeled
        with `label_variants` and one VariantLabel is yielded per labeled
        variant it returns.

        Args:
          variants: the group of variants to label. It is iterated more than
            once and passed to len(), so presumably a list.

        Yields:
          variant_labeler.VariantLabel objects.

        Raises:
          ValueError: if `label_variants` returns a falsy result. Because
            this is a generator, the error surfaces during iteration.
        """
        # NOTE(review): a redacted upstream TODO indicates this lookup should
        # eventually be computed in the grouping step instead.
        group_span = ranges.span(
            [variant_utils.variant_range(v) for v in variants])
        query_region = ranges.expand(
            group_span, _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP)
        truths = list(self._get_truth_variants(query_region))

        n_truths = len(truths)
        if n_truths > self.max_group_size:
            logging.warning(
                'Found a large number of variants to label (n_candidates=%d, '
                'n_truth=%d) relative to candidate cap of %d. This may make the '
                'algorithm very slow.', len(variants), n_truths,
                self.max_group_size)
            # Labeling is not attempted for oversized groups; every candidate
            # is handed back flagged as not confident.
            logging.warning(
                'Returning all variants with not-confident markers.')
            for unlabeled in variants:
                yield variant_labeler.VariantLabel(
                    is_confident=False, genotype=(-1, -1), variant=unlabeled)
            return

        labeler_ref = self.make_labeler_ref(variants, truths)
        labeled_variants = label_variants(variants, truths, labeler_ref)
        if not labeled_variants:
            raise ValueError('Failed to assign labels for variants', variants)

        for labeled in labeled_variants:
            # NOTE(review): upstream flags this confidence rule for a rethink.
            # It seems a variant would be confident if it has a non-ref
            # genotype (as only confident truth variants are considered) or
            # if it overlaps the confident regions.
            yield variant_labeler.VariantLabel(
                is_confident=self._confident_regions.variant_overlaps(labeled),
                genotype=tuple(labeled.calls[0].genotype),
                variant=labeled)
예제 #9
0
  def _label_grouped_variants(self, variants):
    """Yields a VariantLabel for the variants of one group.

    Truth variants are fetched for the span covering all of `variants`,
    expanded by _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP. If more truth
    variants come back than `self.max_group_size`, labeling is skipped: two
    warnings are logged and every input variant is yielded as not confident
    with genotype (-1, -1). Otherwise the variants are labeled with
    `label_variants` and one VariantLabel is yielded per labeled variant it
    returns.

    Args:
      variants: the group of variants to label. It is iterated more than once
        and passed to len(), so presumably a list.

    Yields:
      variant_labeler.VariantLabel objects.

    Raises:
      ValueError: if `label_variants` returns a falsy result. Because this is
        a generator, the error surfaces during iteration.
    """
    # NOTE(review): a redacted upstream TODO indicates this lookup should
    # eventually be computed in the grouping step instead.
    candidate_ranges = [variant_utils.variant_range(v) for v in variants]
    span = ranges.span(candidate_ranges)
    truth_region = ranges.expand(
        span, _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP)
    truths = list(self._get_truth_variants(truth_region))

    too_many_truths = len(truths) > self.max_group_size
    if too_many_truths:
      logging.warning(
          'Found a large number of variants to label (n_candidates=%d, '
          'n_truth=%d) relative to candidate cap of %d. This may make the '
          'algorithm very slow.', len(variants), len(truths),
          self.max_group_size)
      # Labeling is not attempted for oversized groups; every candidate is
      # handed back flagged as not confident.
      logging.warning('Returning all variants with not-confident markers.')
      for candidate in variants:
        yield variant_labeler.VariantLabel(
            is_confident=False, genotype=(-1, -1), variant=candidate)
      return

    ref = self.make_labeler_ref(variants, truths)
    labeled_variants = label_variants(variants, truths, ref)
    if not labeled_variants:
      raise ValueError('Failed to assign labels for variants', variants)

    for labeled in labeled_variants:
      # NOTE(review): upstream flags this confidence rule for a rethink. It
      # seems a variant would be confident if it has a non-ref genotype (as
      # only confident truth variants are considered) or if it overlaps the
      # confident regions.
      yield variant_labeler.VariantLabel(
          is_confident=self._confident_regions.variant_overlaps(labeled),
          genotype=tuple(labeled.calls[0].genotype),
          variant=labeled)
예제 #10
0
파일: vcf.py 프로젝트: zyxue/deepvariant
 def query(self, region):
     """Returns an iterator over the stored variants overlapping `region`.

     A variant is included when ranges.ranges_overlap() is truthy for the
     variant's range and `region`. The result is lazy: variants are tested
     only as the iterator is consumed.
     """
     def overlaps_region(candidate):
         return ranges.ranges_overlap(
             variant_utils.variant_range(candidate), region)

     # A generator expression is already its own iterator.
     return (candidate for candidate in self.variants
             if overlaps_region(candidate))
예제 #11
0
파일: vcf.py 프로젝트: kong75/deepvariant
 def query(self, region):
   """Returns an iterator over the stored variants overlapping `region`.

   Each variant in self.variants is kept when ranges.ranges_overlap() is truthy
   for the variant's range and `region`. Evaluation is lazy, happening as the
   returned iterator is consumed.
   """
   # A generator expression is already its own iterator.
   overlapping = (
       candidate for candidate in self.variants
       if ranges.ranges_overlap(variant_utils.variant_range(candidate), region)
   )
   return overlapping