예제 #1
0
 def test_from_regions_not_empty(self):
   """Checks RangeSet.from_regions on a non-empty list of region literals.

   The input mixes a bare contig name ('chr1') with a 'contig:start-end'
   literal ('chr2:10-20'); the ranges expected back are chr1 0-10 and
   chr2 9-20.
   """
   region_literals = ['chr1', 'chr2:10-20']
   expected_ranges = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 9, 20),
   ]
   contig_map = ranges.contigs_dict(_TEST_CONTIGS)
   actual_ranges = ranges.RangeSet.from_regions(region_literals, contig_map)
   # Order-insensitive comparison of the two collections.
   self.assertItemsEqual(expected_ranges, actual_ranges)
예제 #2
0
 def format_contig_matches():
   """Describes every reference contig and whether it is a shared contig.

   Reads ref_contigs and shared_contigs from the enclosing scope.

   Returns:
     A single string of comma-separated entries, one per contig in
     ref_contigs, each of the form '"<name>" is <n_bases> bp and <status>'.
     The status is 'matched' when the contig's name is found in the map built
     by ranges.contigs_dict(shared_contigs) and 'IS MISSING' otherwise.
   """
   shared_by_name = ranges.contigs_dict(shared_contigs)

   def describe(contig):
     # Membership is decided by contig name only.
     if contig.name in shared_by_name:
       state = 'matched'
     else:
       state = 'IS MISSING'
     return '"{}" is {} bp and {}'.format(contig.name, contig.n_bases, state)

   return ', '.join(describe(contig) for contig in ref_contigs)
예제 #3
0
  def common2(contigs1, contigs2):
    """Computes the common contigs between contigs1 and contigs2.

    A contig from contigs1 is kept when the map built from contigs2 has an
    entry under the same name and that entry has the same n_bases.

    Args:
      contigs1: iterable of contigs exposing `name` and `n_bases`.
      contigs2: contigs passed to ranges.contigs_dict to build the lookup map.

    Returns:
      A list of the elements of contigs1 that have a match, in the order they
      appear in contigs1.
    """
    by_name = ranges.contigs_dict(contigs2)
    shared = []
    for candidate in contigs1:
      counterpart = by_name.get(candidate.name)
      # Both the name and the length have to agree for a contig to count.
      if counterpart and candidate.n_bases == counterpart.n_bases:
        shared.append(candidate)
    return shared
예제 #4
0
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Loads the model from input_model_pckl, collects the indel variants that
    the truth VCF reports in eval_region, computes precision and recall for
    every threshold in _THRESHOLDS and writes one CSV row per threshold.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    # Open both readers as context managers so they are closed once the
    # metrics are computed, and also when any step below raises.
    with sam.SamReader(reads) as sam_reader, \
            fasta.IndexedFastaReader(ref) as ref_reader:
        read_reqs = reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))
        allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=1, read_requirements=read_reqs)

        # NOTE(review): joblib.load unpickles the file, so input_model_pckl
        # must come from a trusted source.
        model = joblib.load(input_model_pckl)

        with vcf.VcfReader(truth_variants) as vcf_reader:
            region = ranges.parse_literal(
                eval_region,
                contig_map=ranges.contigs_dict(ref_reader.header.contigs))
            # Only indels from the truth set take part in the evaluation.
            true_indels = [
                var for var in vcf_reader.query(region)
                if variant_utils.is_indel(var)
            ]

        precisions = compute_precision(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS, region)
        recalls = compute_effective_recall(model, true_indels, sam_reader,
                                           ref_reader, allele_counter_options,
                                           _THRESHOLDS)

    # The readers are no longer needed while the report is written.
    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        fieldnames = ['threshold', 'precision', 'recall']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for threshold in _THRESHOLDS:
            writer.writerow({
                'threshold': threshold,
                'precision': precisions[threshold],
                'recall': recalls[threshold]
            })
예제 #5
0
def _candidates_from_reads(config, ref_reader, reads, region):
    """Returns a list of candidate positions.

    Counts alleles from the reads over the region expanded by
    config.region_expansion_in_bp, then hands the counter to the selector
    that matches config.window_selector_model.model_type.

    Args:
      config: learning.genomics.deepvariant.realigner.WindowSelectorOptions
        options determining the behavior of this window selector.
      ref_reader: GenomeReference. Indexed reference genome to query bases.
      reads: list[nucleus.protos.Read]. The reads we are processing into
        candidate positions.
      region: nucleus.protos.Range. The region we are processing.

    Returns:
      A list. The elements are reference positions within region.

    Raises:
      ValueError: if config.window_selector_model.model_type isn't a valid
        enum name in realigner_pb2.WindowSelectorModel.ModelType.
    """
    read_requirements = reads_pb2.ReadRequirements(
        min_mapping_quality=config.min_mapq,
        min_base_quality=config.min_base_quality)
    counter_options = deepvariant_pb2.AlleleCounterOptions(
        read_requirements=read_requirements,
        keep_legacy_behavior=config.keep_legacy_behavior)

    contig_map = ranges.contigs_dict(ref_reader.header.contigs)
    search_region = ranges.expand(
        region, config.region_expansion_in_bp, contig_map=contig_map)

    counter = allelecounter.AlleleCounter(
        ref_reader.c_reader, search_region, [], counter_options)
    # Every read is registered under the same placeholder sample id.
    for read in reads:
        counter.add(read, 'placeholder_sample_id')

    selector_model = config.window_selector_model
    model_type = selector_model.model_type
    known_types = realigner_pb2.WindowSelectorModel

    if model_type == known_types.VARIANT_READS:
        return _variant_reads_threshold_selector(
            counter, selector_model.variant_reads_model, search_region)

    if model_type == known_types.ALLELE_COUNT_LINEAR:
        return _allele_count_linear_selector(
            counter, selector_model.allele_count_linear_model, search_region)

    raise ValueError('Unknown enum option "{}" for '
                     'WindowSelectorModel.model_type'.format(model_type))
예제 #6
0
def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
  """Builds a RangeSet containing the regions we should call variants in.

  Starts from the ranges spanning all of the contigs, narrows them to
  regions_to_include when that argument is not empty, and finally removes
  everything listed in regions_to_exclude.

  Args:
    contigs: Sequence of ContigInfo protos. Used to determine the initial ranges
      to process (i.e., all bases of these contigs).
    regions_to_include: RangeSet or iterable that can be converted to a
      RangeSet.
    regions_to_exclude: RangeSet or iterable that can be converted to a
      RangeSet.

  Returns:
    A RangeSet.
  """
  # The starting point is every base of every contig.
  calling_regions = ranges.RangeSet.from_contigs(contigs)
  contig_map = ranges.contigs_dict(contigs)

  def to_range_set(region_spec):
    # Both optional arguments are converted against the same contig map.
    return ranges.RangeSet.from_regions(region_spec, contig_map)

  # Keep only what the reference and the requested regions have in common.
  if regions_to_include:
    calling_regions = calling_regions.intersection(
        to_range_set(regions_to_include))

  # exclude_regions modifies calling_regions in place, so nothing is
  # reassigned here.
  if regions_to_exclude:
    calling_regions.exclude_regions(to_range_set(regions_to_exclude))

  return calling_regions
예제 #7
0
 def test_query(self, query, expected_variant_indices):
     """Checks that querying the reader yields the expected variants.

     Args:
       query: region literal that is parsed against the header's contigs.
       expected_variant_indices: indices into self.variants of the variants
         the query should return, in order.
     """
     contig_map = ranges.contigs_dict(self.header.contigs)
     region = ranges.parse_literal(query, contig_map)
     actual = list(self.reader.query(region))
     expected = [self.variants[idx] for idx in expected_variant_indices]
     self.assertEqual(actual, expected)
예제 #8
0
 def test_query(self, query, expected_variant_indices):
   """Checks that querying the reader yields the expected variants.

   Args:
     query: region literal that is parsed against the header's contigs.
     expected_variant_indices: indices into self.variants of the variants
       the query should return, in order.
   """
   contig_map = ranges.contigs_dict(self.header.contigs)
   region = ranges.parse_literal(query, contig_map)
   found = list(self.reader.query(region))
   wanted = [self.variants[idx] for idx in expected_variant_indices]
   self.assertEqual(found, wanted)