def test_call_from_allele_counter(self):
    """Checks that calls_from_allele_counter yields DeepVariantCall protos.

    An AlleleCounter is built over a 1000bp window of the chr20 test data
    and fed every read returned for that window. The VariantCaller must
    then return a non-empty collection whose elements are all
    deepvariant_pb2.DeepVariantCall instances.
    """
    ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    sam_reader = sam.SamReader(testdata.CHR20_BAM)

    window_size = 1000
    window_start = 10000000
    region = ranges.make_range('chr20', window_start,
                               window_start + window_size)

    counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=window_size)
    allele_counter = _allelecounter.AlleleCounter(ref.c_reader, region,
                                                  counter_options)

    caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=2,
        min_count_indels=2,
        min_fraction_snps=0.12,
        min_fraction_indels=0.12,
        sample_name='sample_name',
        p_error=0.001,
        max_gq=50,
        gq_resolution=1,
        ploidy=2)
    caller = variant_calling.VariantCaller(caller_options)

    # Feed every read overlapping the window into the allele counter.
    reads = list(sam_reader.query(region))
    self.assertNotEmpty(reads)
    for read in reads:
        allele_counter.add(read)

    # Ask the caller for the candidates over the whole window.
    candidates = caller.calls_from_allele_counter(allele_counter)

    # There must be at least one candidate ...
    self.assertNotEmpty(candidates)
    # ... and every candidate must be a DeepVariantCall.
    for candidate in candidates:
        self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
def compute_effective_recall(model, true_indels, sam_reader, ref_reader,
                             allele_counter_options, thresholds):
    """Window size aware recall computation.

    During realignment, the region around selected positions will be
    realigned. This means that if a position is marked for realignment close
    enough to an unrecognized INDEL, that INDEL will still be realigned. Since
    scikit-learn and pandas do not offer a feature for that definition of
    recall, we need to manually check the neighborhood of each INDEL.

    Args:
      model: a scikit-learn model implementing the decision_function method.
      true_indels: a list of nucleus.Variants.
      sam_reader: a nucleus.io.SamReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      allele_counter_options: a deepvariant.AlleleCounterOptions.
      thresholds: a list of threshold to compute recall on.

    Returns:
      A dict with a threshold as key and recall at that threshold as value.
      The dict is a collections.defaultdict(float): a threshold that no INDEL
      exceeded is not stored explicitly and reads back as 0.0.
    """
    feature_columns = [
        'ref_nonconfident_read_count', 'ref_supporting_read_count',
        'SUBSTITUTION', 'INSERTION', 'DELETION', 'SOFT_CLIP'
    ]

    scores = collections.defaultdict(float)
    n_indels = len(true_indels)
    for indel in true_indels:
        region = ranges.make_range(indel.reference_name,
                                   indel.start - _RECALL_WINDOW,
                                   indel.start + _RECALL_WINDOW)
        allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                     region,
                                                     allele_counter_options)
        for read in sam_reader.query(region):
            allele_counter.add(read)

        records = [
            allele_count_to_vector(count)
            for count in allele_counter.counts()
            if _check_allele_count_quality(count)
        ]
        if not records:
            # No position in the window passed the quality check, so nothing
            # can be scored: this INDEL counts as missed at every threshold.
            # (Selecting feature columns from an empty frame would raise.)
            continue

        # fillna returns a new frame; the result must be kept, otherwise the
        # NaNs for features absent from some records reach the model.
        x_region = pd.DataFrame.from_records(records).fillna(0)
        y_score = model.decision_function(x_region[feature_columns])

        # This INDEL will be realigned if any position within the window is
        # marked for realignment.
        y_max = y_score.max()
        for threshold in thresholds:
            if y_max > threshold:
                scores[threshold] += 1

    # Turn hit counts into recall. `scores` is empty when there are no
    # INDELs, so n_indels is never zero when the division runs.
    for score in scores:
        scores[score] /= n_indels
    return scores
def generate_data(vcf_reader, ref_reader, sam_reader, baseline_contig,
                  exclude_contig):
    """Builds a pandas.DataFrame describing the AlleleCount at each position.

    One row is produced per training position for which features could be
    extracted. The features included are:
      - 'ref_nonconfident_read_count'
      - 'ref_supporting_read_count'
      - 'SUBSTITUTION'
      - 'INSERTION'
      - 'DELETION'
      - 'SOFT_CLIP'
      - 'label'
    They are extracted from the AlleleCount proto at the concerned position.
    Missing values are replaced by 0 and the frame is passed through
    `shuffle` before being returned.

    Args:
      vcf_reader: a nucleus.io.VcfReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      sam_reader: a nucleus.io.SamReader.
      baseline_contig: string, contig from which to sample baseline positions.
      exclude_contig: string, contig to exclude for test purposes.

    Returns:
      pandas.Dataframe object.
    """
    # Same read filtering parameters as the ones used in make_examples.
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=1,
        read_requirements=reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)))

    def _features_at(position):
        # Count alleles over the single base starting at position.start.
        single_base = ranges.make_range(position.reference_name,
                                        position.start, position.start + 1)
        counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                              single_base, counter_options)
        return _position_to_features(sam_reader, counter, single_base,
                                     position, exclude_contig)

    candidate_rows = (
        _features_at(position)
        for position in generate_positions(vcf_reader, ref_reader,
                                           baseline_contig))
    # Positions for which no feature row was produced are dropped.
    rows = [row for row in candidate_rows if row is not None]

    frame = pd.DataFrame(rows).fillna(0)
    return shuffle(frame)
def test_wrap(self):
    """Checks that the wrapped AlleleCounter yields one count per base.

    Every read returned for a 100bp window of the chr20 test data is added
    to an AlleleCounter built over that window; counts() must then contain
    exactly as many entries as the window has positions.
    """
    ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    sam_reader = sam.SamReader(testdata.CHR20_BAM)

    window_size = 100
    window_start = 10000000
    region = ranges.make_range('chr20', window_start,
                               window_start + window_size)
    allele_counter = _allelecounter.AlleleCounter(
        ref.c_reader, region,
        deepvariant_pb2.AlleleCounterOptions(partition_size=window_size))

    # The window must actually contain reads for the check to mean anything.
    reads = list(sam_reader.query(region))
    self.assertGreater(len(reads), 0)
    for read in reads:
        allele_counter.add(read)

    self.assertEqual(len(allele_counter.counts()), window_size)
def _candidates_from_reads(config, ref_reader, reads, region):
    """Returns a list of candidate positions.

    The region is first expanded by config.region_expansion_in_bp, the reads
    are accumulated into an AlleleCounter over the expanded region, and the
    selector matching config.window_selector_model.model_type is applied.

    Args:
      config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
        options determining the behavior of this window selector.
      ref_reader: GenomeReference. Indexed reference genome to query bases.
      reads: list[nucleus.protos.Read]. The reads we are processing into
        candidate positions.
      region: nucleus.protos.Range. The region we are processing.

    Returns:
      A list. The elements are reference positions within region.

    Raises:
      ValueError: if config.window_selector_model.model_type isn't a valid
        enum name in realigner_pb2.WindowSelectorModel.ModelType.
    """
    read_requirements = reads_pb2.ReadRequirements(
        min_mapping_quality=config.min_mapq,
        min_base_quality=config.min_base_quality)
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        read_requirements=read_requirements,
        keep_legacy_behavior=config.keep_legacy_behavior)

    contig_map = ranges.contigs_dict(ref_reader.header.contigs)
    expanded_region = ranges.expand(
        region, config.region_expansion_in_bp, contig_map=contig_map)

    counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                          expanded_region, [],
                                          counter_options)
    for read in reads:
        counter.add(read, 'placeholder_sample_id')

    # Dispatch on the configured selector model.
    selector_model = config.window_selector_model
    model_type = selector_model.model_type
    if model_type == realigner_pb2.WindowSelectorModel.VARIANT_READS:
        return _variant_reads_threshold_selector(
            counter, selector_model.variant_reads_model, expanded_region)
    if model_type == realigner_pb2.WindowSelectorModel.ALLELE_COUNT_LINEAR:
        return _allele_count_linear_selector(
            counter, selector_model.allele_count_linear_model,
            expanded_region)
    raise ValueError('Unknown enum option "{}" for '
                     'WindowSelectorModel.model_type'.format(model_type))
def _make_allele_counter_for_region(self, region):
    """Returns a new AlleleCounter for `region`.

    The counter is constructed from the C reader of `self.ref_reader` and
    the allele counter options stored in `self.options`.
    """
    c_reader = self.ref_reader.get_c_reader()
    counter_options = self.options.allele_counter_options
    return allelecounter.AlleleCounter(c_reader, region, counter_options)
def compute_precision(model, true_indels, sam_reader, ref_reader,
                      allele_counter_options, thresholds, eval_region):
    """Simple precision computation for a given set of thresholds.

    This implementation computes the precision in a sliding window fashion in
    order to limit the memory consumption since it can done on a large
    `eval_region`.

    Args:
      model: a scikit-learn model implementing the decision_function method.
      true_indels: a list of nucleus.Variants expected to come from
        `eval_region`.
      sam_reader: a nucleus.io.SamReader.
      ref_reader: a nucleus.io.IndexedFastaReader.
      allele_counter_options: a deepvariant.AlleleCounterOptions.
      thresholds: a list of threshold to compute recall on.
      eval_region: a nucleus.v1.Range of the range to evaluate on.

    Returns:
      A dict with a threshold as key and precision at that threshold as
      value. The precision is NaN for a threshold at which no position was
      scored as positive.
    """
    feature_columns = [
        'ref_nonconfident_read_count', 'ref_supporting_read_count',
        'SUBSTITUTION', 'INSERTION', 'DELETION', 'SOFT_CLIP'
    ]

    # They all come from `eval_region` so they can be identified by start.
    indels = {var.start for var in true_indels}
    region_size = eval_region.end - eval_region.start

    true_positives = collections.defaultdict(float)
    positives = collections.defaultdict(float)
    for position in six.moves.range(eval_region.start, eval_region.end,
                                    _WINDOW_SIZE):
        # The last window is clipped so it never extends past eval_region.
        region = ranges.make_range(
            eval_region.reference_name, position,
            min(position + _WINDOW_SIZE, eval_region.end))
        allele_counter = allelecounter.AlleleCounter(ref_reader.c_reader,
                                                     region,
                                                     allele_counter_options)
        for read in sam_reader.query(region):
            allele_counter.add(read)

        # counts() is enumerated from the window start so that `pos` is the
        # reference position used to look the count up in `indels`.
        x_region_list = []
        for pos, count in enumerate(allele_counter.counts(), start=position):
            if not _check_allele_count_quality(count):
                continue
            row = allele_count_to_vector(count)
            row['label'] = 1 if pos in indels else 0
            x_region_list.append(row)

        if not x_region_list:
            continue

        # fillna returns a new frame; the result must be kept, otherwise the
        # NaNs for features absent from some rows reach the model.
        x_region = pd.DataFrame.from_records(x_region_list).fillna(0)
        y_score = model.decision_function(x_region[feature_columns])

        positives_mask = x_region['label'] == 1
        for threshold in thresholds:
            predicted = y_score > threshold
            positives[threshold] += sum(predicted)
            true_positives[threshold] += sum(predicted & positives_mask)

        # Periodic progress report; the last value is a fraction of the
        # region, not a percentage.
        if position % 100000 == 0:
            print('processed %d positions out of %d -- %2f complete' %
                  (position - eval_region.start, region_size,
                   float(position - eval_region.start) / region_size))

    precisions = {}
    for threshold in thresholds:
        if positives[threshold] == 0:
            # We arbitrarily decide that if we find no positives our
            # precision is NaN.
            precisions[threshold] = float('nan')
        else:
            precisions[threshold] = (
                true_positives[threshold] / positives[threshold])
    return precisions