def test_from_regions_not_empty(self):
    """Checks RangeSet.from_regions on a non-empty list of region literals.

    The bare contig literal 'chr1' is expected to yield the range
    ('chr1', 0, 10), and the literal 'chr2:10-20' is expected to yield
    ('chr2', 9, 20). Ordering of the produced ranges is not asserted.
    """
    contig_map = ranges.contigs_dict(_TEST_CONTIGS)
    expected = [
        ranges.make_range('chr1', 0, 10),
        ranges.make_range('chr2', 9, 20),
    ]
    actual = ranges.RangeSet.from_regions(['chr1', 'chr2:10-20'], contig_map)
    self.assertItemsEqual(expected, actual)
def format_contig_matches():
    """Builds a one-line summary of which reference contigs were matched.

    Reads `ref_contigs` and `shared_contigs` from the enclosing scope; neither
    is a parameter of this function.

    Returns:
      A comma-separated string with one entry per contig in `ref_contigs`,
      giving its name, its n_bases, and either 'matched' (the name is a key of
      ranges.contigs_dict(shared_contigs)) or 'IS MISSING' (it is not).
    """
    shared_by_name = ranges.contigs_dict(shared_contigs)

    def describe(contig):
        # A contig is reported as matched purely on name membership.
        if contig.name in shared_by_name:
            status = 'matched'
        else:
            status = 'IS MISSING'
        return '"{}" is {} bp and {}'.format(contig.name, contig.n_bases,
                                             status)

    return ', '.join(describe(contig) for contig in ref_contigs)
def common2(contigs1, contigs2):
    """Returns the contigs of contigs1 that are also present in contigs2.

    A contig from contigs1 is kept when looking up its name in
    ranges.contigs_dict(contigs2) yields a truthy contig whose n_bases equals
    its own.

    Args:
      contigs1: iterable of contigs exposing `name` and `n_bases`.
      contigs2: contigs to compare against; passed to ranges.contigs_dict.

    Returns:
      A list of the qualifying elements of contigs1, in their original order.
    """
    by_name = ranges.contigs_dict(contigs2)
    shared = []
    for candidate in contigs1:
        counterpart = by_name.get(candidate.name)
        # Same name is not enough: the lengths have to agree as well.
        if counterpart and candidate.n_bases == counterpart.n_bases:
            shared.append(candidate)
    return shared
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Loads the model, collects the indel variants of the truth VCF that fall in
    eval_region, computes precision and recall for every threshold in
    _THRESHOLDS, and writes one CSV row per threshold.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    # The BAM and FASTA readers are managed with `with` so they are closed
    # even when evaluation raises (e.g. on an unparsable eval_region), matching
    # how the VCF reader is already handled below.
    with sam.SamReader(reads) as sam_reader, \
            fasta.IndexedFastaReader(ref) as ref_reader:
        read_reqs = reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
        allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=1, read_requirements=read_reqs)

        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; input_model_pckl must come from a trusted source.
        model = joblib.load(input_model_pckl)

        with vcf.VcfReader(truth_variants) as vcf_reader:
            region = ranges.parse_literal(
                eval_region,
                contig_map=ranges.contigs_dict(ref_reader.header.contigs))
            # Materialize the query while the VCF reader is still open.
            true_indels = [
                var for var in vcf_reader.query(region)
                if variant_utils.is_indel(var)
            ]

        precisions = compute_precision(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS, region)
        recalls = compute_effective_recall(model, true_indels, sam_reader,
                                           ref_reader, allele_counter_options,
                                           _THRESHOLDS)

        with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
            fieldnames = ['threshold', 'precision', 'recall']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for threshold in _THRESHOLDS:
                writer.writerow({
                    'threshold': threshold,
                    'precision': precisions[threshold],
                    'recall': recalls[threshold]
                })
def _candidates_from_reads(config, ref_reader, reads, region):
    """Returns a list of candidate positions.

    Counts alleles from `reads` over `region` expanded by
    config.region_expansion_in_bp, then hands the counter to the selector
    chosen by config.window_selector_model.model_type.

    Args:
      config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
        options determining the behavior of this window selector.
      ref_reader: GenomeReference. Indexed reference genome to query bases.
      reads: list[nucleus.protos.Read]. The reads we are processing into
        candidate positions.
      region: nucleus.protos.Range. The region we are processing.

    Returns:
      A list. The elements are reference positions within region.

    Raises:
      ValueError: if config.window_selector_model.model_type isn't a valid enum
        name in realigner_pb2.WindowSelectorModel.ModelType.
    """
    read_requirements = reads_pb2.ReadRequirements(
        min_mapping_quality=config.min_mapq,
        min_base_quality=config.min_base_quality)
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        read_requirements=read_requirements,
        keep_legacy_behavior=config.keep_legacy_behavior)

    contig_map = ranges.contigs_dict(ref_reader.header.contigs)
    expanded_region = ranges.expand(
        region, config.region_expansion_in_bp, contig_map=contig_map)

    counter = allelecounter.AlleleCounter(
        ref_reader.c_reader, expanded_region, [], counter_options)
    for read in reads:
        counter.add(read, 'placeholder_sample_id')

    selector_model = config.window_selector_model
    model_type = selector_model.model_type
    model_types = realigner_pb2.WindowSelectorModel

    if model_type == model_types.VARIANT_READS:
        return _variant_reads_threshold_selector(
            counter, selector_model.variant_reads_model, expanded_region)

    if model_type == model_types.ALLELE_COUNT_LINEAR:
        return _allele_count_linear_selector(
            counter, selector_model.allele_count_linear_model,
            expanded_region)

    raise ValueError('Unknown enum option "{}" for '
                     'WindowSelectorModel.model_type'.format(model_type))
def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
    """Builds a RangeSet containing the regions we should call variants in.

    Starts from the ranges spanning every contig, narrows them to
    regions_to_include when that argument is non-empty, and then removes
    regions_to_exclude when that argument is non-empty.

    Args:
      contigs: Sequence of ContigInfo protos. Used to determine the initial
        ranges to process (i.e., all bases of these contigs).
      regions_to_include: RangeSet or iterable that can be converted to a
        RangeSet.
      regions_to_exclude: RangeSet or iterable that can be converted to a
        RangeSet.

    Returns:
      A RangeSet.
    """
    # Baseline: every base of every contig is callable.
    calling_regions = ranges.RangeSet.from_contigs(contigs)
    contig_map = ranges.contigs_dict(contigs)

    if regions_to_include:
        # Keep only what the contigs and the requested regions have in common.
        included = ranges.RangeSet.from_regions(regions_to_include, contig_map)
        calling_regions = calling_regions.intersection(included)

    if regions_to_exclude:
        excluded = ranges.RangeSet.from_regions(regions_to_exclude, contig_map)
        # exclude_regions modifies calling_regions in place.
        calling_regions.exclude_regions(excluded)

    return calling_regions
def test_query(self, query, expected_variant_indices):
    """Checks that querying the reader over `query` returns the expected variants.

    Args:
      query: region literal, parsed against the contigs of self.header.
      expected_variant_indices: indices into self.variants of the variants the
        query should return, in the expected order.
    """
    contig_map = ranges.contigs_dict(self.header.contigs)
    query_range = ranges.parse_literal(query, contig_map)
    actual = list(self.reader.query(query_range))
    expected = [self.variants[idx] for idx in expected_variant_indices]
    self.assertEqual(actual, expected)
def test_query(self, query, expected_variant_indices):
    """Verifies reader.query results for a region literal.

    Args:
      query: region literal, resolved with the contigs from self.header.
      expected_variant_indices: positions in self.variants of the variants
        that the query is expected to yield, in order.
    """
    known_contigs = ranges.contigs_dict(self.header.contigs)
    region = ranges.parse_literal(query, known_contigs)
    found = list(self.reader.query(region))
    wanted = [self.variants[index] for index in expected_variant_indices]
    self.assertEqual(found, wanted)