Example #1
    def test_call_from_allele_counter(self):
        ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        sam_reader = sam.SamReader(testdata.CHR20_BAM)
        size = 1000
        region = ranges.make_range('chr20', 10000000, 10000000 + size)
        allele_counter = _allelecounter.AlleleCounter(
            ref.c_reader, region,
            deepvariant_pb2.AlleleCounterOptions(partition_size=size))
        caller = variant_calling.VariantCaller(
            deepvariant_pb2.VariantCallerOptions(min_count_snps=2,
                                                 min_count_indels=2,
                                                 min_fraction_snps=0.12,
                                                 min_fraction_indels=0.12,
                                                 sample_name='sample_name',
                                                 p_error=0.001,
                                                 max_gq=50,
                                                 gq_resolution=1,
                                                 ploidy=2))

        # Grab all of the reads in our region and add them to the allele_counter.
        reads = list(sam_reader.query(region))
        self.assertNotEmpty(reads)
        for read in reads:
            allele_counter.add(read)

        # Get the candidates records for this whole region.
        candidates = caller.calls_from_allele_counter(allele_counter)

        # We should have at least some candidate records.
        self.assertNotEmpty(candidates)

        # Each candidate should be a DeepVariantCall.
        for candidate in candidates:
            self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
Example #2
def compute_effective_recall(model, true_indels, sam_reader, ref_reader,
                             allele_counter_options, thresholds):
    """Window size aware recall computation.

  During realignment, the region around selected positions will be realigned.
  This means that if a position is marked for realignment close enough to an
  unrecognized INDEL, that INDEL will still be realigned.
  Since scikit-learn and pandas do not offer a feature for that definition of
  recall, we need to manually check the neighborhood of each INDEL.

  Args:
    model: a scikit-learn model implementing the decision_function method.
    true_indels: a list of nucleus.Variants.
    sam_reader: a nucleus.io.SamReader.
    ref_reader: a nucleus.io.IndexedFastaReader.
    allele_counter_options: a deepvariant.AlleleCounterOptions.
    thresholds: a list of thresholds at which to compute recall.

  Returns:
    A dict with a threshold as key and recall at that threshold as value.
  """
    scores = collections.defaultdict(float)
    n_indels = len(true_indels)

    for indel in true_indels:
        region = ranges.make_range(indel.reference_name,
                                   indel.start - _RECALL_WINDOW,
                                   indel.start + _RECALL_WINDOW)
        allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                     region,
                                                     allele_counter_options)

        reads = list(sam_reader.query(region))
        for read in reads:
            allele_counter.add(read)
        counts = allele_counter.counts()

        x_region = pd.DataFrame.from_records([
            allele_count_to_vector(count) for count in counts
            if _check_allele_count_quality(count)
        ])
        x_region = x_region.fillna(0)

        y_score = model.decision_function(x_region[[
            'ref_nonconfident_read_count', 'ref_supporting_read_count',
            'SUBSTITUTION', 'INSERTION', 'DELETION', 'SOFT_CLIP'
        ]])

        # This INDEL will be realigned if any position within the window is
        # marked for realignment.
        y_max = y_score.max()
        for threshold in thresholds:
            if y_max > threshold:
                scores[threshold] += 1

    for score in scores:
        scores[score] /= n_indels
    return scores
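A minimal usage sketch for `compute_effective_recall` above, under stated assumptions: the `fasta`, `sam`, and `deepvariant_pb2` modules are the ones already imported by these snippets, the file paths and threshold values are placeholders, and `model` and `true_indels` are assumed to come from earlier training and truth-parsing steps.

ref_reader = fasta.IndexedFastaReader('/path/to/reference.fasta')  # placeholder
sam_reader = sam.SamReader('/path/to/reads.bam')                   # placeholder
allele_counter_options = deepvariant_pb2.AlleleCounterOptions(partition_size=1)
thresholds = [-2.0, -1.0, 0.0, 1.0, 2.0]  # illustrative thresholds

# `model` is a fitted scikit-learn estimator exposing decision_function, and
# `true_indels` is a list of nucleus Variant protos for the truth INDELs
# (both are placeholders here).
recalls = compute_effective_recall(model, true_indels, sam_reader, ref_reader,
                                   allele_counter_options, thresholds)
for threshold in thresholds:
    print('recall at threshold %.1f: %.3f' % (threshold, recalls[threshold]))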
Example #3
def generate_data(vcf_reader, ref_reader, sam_reader, baseline_contig,
                  exclude_contig):
    """Generates a pandas.DataFrame summarizing the AlleleCount at each position.

  The features included are:
        - 'ref_nonconfident_read_count'
        - 'ref_supporting_read_count'
        - 'SUBSTITUTION'
        - 'INSERTION'
        - 'DELETION'
        - 'SOFT_CLIP'
        - 'label'
  These features are extracted from the AlleleCount proto at the position of
  interest.

  Args:
    vcf_reader: a nucleus.io.VcfReader.
    ref_reader: a nucleus.io.IndexedFastaReader.
    sam_reader: a nucleus.io.SamReader.
    baseline_contig: string, contig from which to sample baseline positions.
    exclude_contig: string, contig to exclude for test purposes.

  Returns:
    A pandas.DataFrame object.
  """

    # These parameters are the ones used in make_examples.
    read_reqs = reads_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=1, read_requirements=read_reqs)

    training_positions = generate_positions(vcf_reader, ref_reader,
                                            baseline_contig)
    positions_records = []

    for position in training_positions:
        region = ranges.make_range(position.reference_name, position.start,
                                   position.start + 1)
        allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                     region,
                                                     allele_counter_options)
        row = _position_to_features(sam_reader, allele_counter, region,
                                    position, exclude_contig)
        if row is not None:
            positions_records.append(row)

    df = pd.DataFrame(positions_records)
    df = df.fillna(0)
    df = shuffle(df)
    return df
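A short usage sketch for `generate_data` above; the paths and contig names are illustrative assumptions, and `vcf.VcfReader`, `fasta.IndexedFastaReader`, and `sam.SamReader` are the nucleus.io readers named in the docstring. The feature/label split mirrors the columns the function emits.

vcf_reader = vcf.VcfReader('/path/to/truth.vcf.gz')                # placeholder
ref_reader = fasta.IndexedFastaReader('/path/to/reference.fasta')  # placeholder
sam_reader = sam.SamReader('/path/to/reads.bam')                   # placeholder

df = generate_data(vcf_reader, ref_reader, sam_reader,
                   baseline_contig='chr1', exclude_contig='chr20')

# Split into the feature columns listed in the docstring and the label column.
features = ['ref_nonconfident_read_count', 'ref_supporting_read_count',
            'SUBSTITUTION', 'INSERTION', 'DELETION', 'SOFT_CLIP']
x_train, y_train = df[features], df['label']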
Example #4
    def test_wrap(self):
        ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        sam_reader = sam.SamReader(testdata.CHR20_BAM)
        size = 100
        region = ranges.make_range('chr20', 10000000, 10000000 + size)
        options = deepvariant_pb2.AlleleCounterOptions(partition_size=size)
        allele_counter = _allelecounter.AlleleCounter(ref.c_reader, region,
                                                      options)
        reads = list(sam_reader.query(region))
        self.assertGreater(len(reads), 0)
        for read in reads:
            allele_counter.add(read)
        counts = allele_counter.counts()
        self.assertEqual(len(counts), size)
Example #5
def _candidates_from_reads(config, ref_reader, reads, region):
    """Returns a list of candidate positions.

  Args:
    config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
      options determining the behavior of this window selector.
    ref_reader: GenomeReference. Indexed reference genome to query bases.
    reads: list[nucleus.protos.Read]. The reads we are processing into candidate
      positions.
    region: nucleus.protos.Range. The region we are processing.

  Returns:
    A list. The elements are reference positions within region.

  Raises:
    ValueError: if config.window_selector_model.model_type isn't a valid enum
    name in realigner_pb2.WindowSelectorModel.ModelType.
  """
    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        read_requirements=reads_pb2.ReadRequirements(
            min_mapping_quality=config.min_mapq,
            min_base_quality=config.min_base_quality),
        keep_legacy_behavior=config.keep_legacy_behavior)
    expanded_region = ranges.expand(region,
                                    config.region_expansion_in_bp,
                                    contig_map=ranges.contigs_dict(
                                        ref_reader.header.contigs))

    allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                 expanded_region, [],
                                                 allele_counter_options)

    for read in reads:
        allele_counter.add(read, 'placeholder_sample_id')

    model_type = config.window_selector_model.model_type
    if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS:
        return _variant_reads_threshold_selector(
            allele_counter, config.window_selector_model.variant_reads_model,
            expanded_region)
    elif model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR:
        return _allele_count_linear_selector(
            allele_counter,
            config.window_selector_model.allele_count_linear_model,
            expanded_region)
    else:
        raise ValueError('Unknown enum option "{}" for '
                         'WindowSelectorModel.model_type'.format(
                             config.window_selector_model.model_type))
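A hedged sketch of how the `config` consumed by `_candidates_from_reads` might be assembled, assuming `WindowSelectorOptions` and `WindowSelectorModel` are the messages defined in `realigner_pb2` as the docstring indicates; the numeric field values are illustrative only, and `ref_reader`, `reads`, and `region` are placeholders supplied by the caller.

ws_model = realigner_pb2.WindowSelectorModel(
    model_type=realigner_pb2.WindowSelectorModel.VARIANT_READS)
config = realigner_pb2.WindowSelectorOptions(
    min_mapq=20,                 # illustrative value
    min_base_quality=20,         # illustrative value
    region_expansion_in_bp=20,   # illustrative value
    keep_legacy_behavior=False,
    window_selector_model=ws_model)

# `ref_reader`, `reads`, and `region` are assumed to come from the caller.
candidate_positions = _candidates_from_reads(config, ref_reader, reads, region)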
Example #6
    def _make_allele_counter_for_region(self, region):
        return allelecounter.AlleleCounter(self.ref_reader.get_c_reader(),
                                           region,
                                           self.options.allele_counter_options)
Example #7
def compute_precision(model, true_indels, sam_reader, ref_reader,
                      allele_counter_options, thresholds, eval_region):
    """Simple precision computation for a given set of thresholds.

  This implementation computes the precision in a sliding-window fashion in
  order to limit memory consumption, since it can be run on a large
  `eval_region`.

  Args:
    model: a scikit-learn model implementing the decision_function method.
    true_indels: a list of nucleus.Variants expected to come from `eval_region`.
    sam_reader: a nucleus.io.SamReader.
    ref_reader: a nucleus.io.IndexedFastaReader.
    allele_counter_options: a deepvariant.AlleleCounterOptions.
    thresholds: a list of thresholds at which to compute precision.
    eval_region: a nucleus.v1.Range of the range to evaluate on.

  Returns:
    A dict with a threshold as key and precision at that threshold as value.
  """
    # All true indels come from `eval_region`, so they can be identified by
    # their start position.
    indels = set([var.start for var in true_indels])
    true_positives = collections.defaultdict(float)
    positives = collections.defaultdict(float)

    for position in six.moves.range(eval_region.start, eval_region.end,
                                    _WINDOW_SIZE):
        region = ranges.make_range(
            eval_region.reference_name, position,
            min(position + _WINDOW_SIZE, eval_region.end))
        allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                     region,
                                                     allele_counter_options)

        reads = list(sam_reader.query(region))
        for read in reads:
            allele_counter.add(read)
        counts = allele_counter.counts()

        x_region_list = []
        for pos, count in enumerate(counts, start=position):
            if not _check_allele_count_quality(count):
                continue

            row = allele_count_to_vector(count)
            row['label'] = 1 if pos in indels else 0
            x_region_list.append(row)

        if not x_region_list:
            continue

        x_region = pd.DataFrame.from_records(x_region_list)
        x_region = x_region.fillna(0)

        y_score = model.decision_function(x_region[[
            'ref_nonconfident_read_count', 'ref_supporting_read_count',
            'SUBSTITUTION', 'INSERTION', 'DELETION', 'SOFT_CLIP'
        ]])

        positives_mask = x_region['label'] == 1
        for threshold in thresholds:
            positives[threshold] += sum(y_score > threshold)
            true_positives[threshold] += sum((y_score > threshold)
                                             & positives_mask)

        if position % 100000 == 0:
            print('processed %d positions out of %d -- %.2f complete' %
                  (position - eval_region.start, eval_region.end -
                   eval_region.start, float(position - eval_region.start) /
                   (eval_region.end - eval_region.start)))

    precisions = {}
    for threshold in thresholds:
        if positives[threshold] == 0:
            # We arbitrarily decide that if we find no positives our precision is NaN.
            precisions[threshold] = float('nan')
        else:
            precisions[
                threshold] = true_positives[threshold] / positives[threshold]
    return precisions
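A minimal usage sketch for `compute_precision` above; the evaluation range and thresholds are illustrative, and `model`, `true_indels`, `sam_reader`, `ref_reader`, and `allele_counter_options` are assumed to be set up as in the earlier sketches.

eval_region = ranges.make_range('chr20', 10000000, 11000000)  # illustrative range
thresholds = [-2.0, -1.0, 0.0, 1.0, 2.0]                      # illustrative thresholds

precisions = compute_precision(model, true_indels, sam_reader, ref_reader,
                               allele_counter_options, thresholds, eval_region)
for threshold in thresholds:
    print('precision at threshold %.1f: %.3f'
          % (threshold, precisions[threshold]))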