예제 #1
0
 def test_reservoir_sample_frequency(self, iterable_size, k):
   """Tests observed frequency is close to expected frequency.

   Draws many reservoir samples of size k from range(iterable_size) and
   checks that each item is selected at roughly the expected rate
   min(k / iterable_size, 1).

   Args:
     iterable_size: int, the number of distinct items sampled from.
     k: int, the reservoir (sample) size.
   """
   # Use a fixed random number so our test is deterministic.
   random = np.random.RandomState(123456789)
   n_replicates = 100000
   counts = collections.Counter(
       item
       for _ in range(n_replicates)
       for item in utils.reservoir_sample(range(iterable_size), k, random))
   expected_frequency = min(k / float(iterable_size), 1.0)
   # dict.itervalues() exists only in Python 2; values() works in both 2 and 3.
   for c in counts.values():
     observed_frequency = c / float(n_replicates)
     npt.assert_allclose(observed_frequency, expected_frequency, atol=0.01)
예제 #2
0
 def test_reservoir_sample_frequency(self, iterable_size, k):
     """Tests observed frequency is close to expected frequency.

     Draws many reservoir samples of size k from range(iterable_size) and
     checks that each item is selected at roughly the expected rate
     min(k / iterable_size, 1).

     Args:
       iterable_size: int, the number of distinct items sampled from.
       k: int, the reservoir (sample) size.
     """
     # Use a fixed random number so our test is deterministic.
     random = np.random.RandomState(123456789)
     n_replicates = 100000
     counts = collections.Counter(item for _ in range(n_replicates)
                                  for item in utils.reservoir_sample(
                                      range(iterable_size), k, random))
     expected_frequency = min(k / float(iterable_size), 1.0)
     # dict.itervalues() exists only in Python 2; values() works in 2 and 3.
     for c in counts.values():
         observed_frequency = c / float(n_replicates)
         npt.assert_allclose(observed_frequency,
                             expected_frequency,
                             atol=0.01)
예제 #3
0
 def test_reservoir_sample_length(self):
     """Tests samples have expected length.

     Covers k greater than, equal to, and less than the iterable length,
     the k == 0 edge case, and the ValueError raised for negative k.
     """
     first_ten_ints = range(10)
     # assertEquals is a deprecated alias; assertEqual is the supported name.
     # Test sampling with k > len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 11)), 10)
     # Test sampling with k == len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 10)), 10)
     # Test sampling with k < len(iterable).
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 9)), 9)
     # Test sampling with k == 0.
     self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 0)), 0)
     # Test sampling with k < 0 (bad args).
     with self.assertRaises(ValueError):
         utils.reservoir_sample(first_ten_ints, -1)
예제 #4
0
 def test_reservoir_sample_length(self):
   """Tests samples have expected length.

   Covers k greater than, equal to, and less than the iterable length,
   the k == 0 edge case, and the ValueError raised for negative k.
   """
   first_ten_ints = range(10)
   # assertEquals is a deprecated alias; assertEqual is the supported name.
   # Test sampling with k > len(iterable).
   self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 11)), 10)
   # Test sampling with k == len(iterable).
   self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 10)), 10)
   # Test sampling with k < len(iterable).
   self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 9)), 9)
   # Test sampling with k == 0.
   self.assertEqual(len(utils.reservoir_sample(first_ten_ints, 0)), 0)
   # Test sampling with k < 0 (bad args).
   with self.assertRaises(ValueError):
     utils.reservoir_sample(first_ten_ints, -1)
예제 #5
0
  def region_reads(self, region):
    """Returns read alignments overlapping the region.

    If self.realigner is set, the reads are realigned before being returned;
    otherwise the original alignments from the SAM reader are used.

    Args:
      region: A nucleus.genomics.v1.Range object specifying the region we
        want to realign reads.

    Returns:
      [genomics.deepvariant.core.genomics.Read], reads overlapping the region.
    """
    query_result = self.sam_reader.query(region)
    max_reads = self.options.max_reads_per_partition
    if max_reads > 0:
      # Down-sample to at most max_reads reads per partition.
      query_result = utils.reservoir_sample(query_result, max_reads,
                                            self.random)
    reads = list(query_result)
    if not self.realigner:
      return reads
    _, realigned_reads = self.realigner.realign_reads(reads, region)
    return realigned_reads
예제 #6
0
  def build_pileup(self,
                   dv_call,
                   refbases,
                   reads,
                   alt_alleles,
                   custom_ref=False):
    """Creates a pileup tensor for dv_call.

    Args:
      dv_call: learning.genomics.deepvariant.DeepVariantCall object with
        information on our candidate call and allele support information.
      refbases: A string options.width in length containing the reference base
        sequence to encode. The middle base of this string should be at the
        start of the variant in dv_call.
      reads: Iterable of third_party.nucleus.protos.Read objects that we'll use
        to encode the read information supporting our call. Assumes each read is
        aligned and is well-formed (e.g., has bases and quality scores, cigar).
        Rows of the image are encoded in the same order as reads.
      alt_alleles: A collection of alternative_bases from dv_call.variant that
        we are treating as "alt" when constructing this pileup image. A read
        will be considered supporting the "alt" allele if it occurs in the
        support list for any alt_allele in this collection.
      custom_ref: True if refbases should not be checked for matching against
        variant's reference_bases.

    Returns:
      A [self.width, self.height, DEFAULT_NUM_CHANNEL] uint8 Tensor image.

    Raises:
      ValueError: if any arguments are invalid.
    """
    # Validate inputs up front, in the same order callers may rely on.
    if len(refbases) != self.width:
      raise ValueError('refbases is {} long but width is {}'.format(
          len(refbases), self.width))

    if not alt_alleles:
      raise ValueError('alt_alleles cannot be empty')
    if any(alt not in dv_call.variant.alternate_bases for alt in alt_alleles):
      raise ValueError(
          'all elements of alt_alleles must be the alternate bases'
          ' of dv_call.variant', alt_alleles, dv_call.variant)

    image_start_pos = dv_call.variant.start - self.half_width
    middle_matches_variant = (refbases[self.half_width] ==
                              dv_call.variant.reference_bases[0])
    if not custom_ref and not middle_matches_variant:
      raise ValueError('The middle base of reference sequence in the window '
                       "({} at base {}) doesn't match first "
                       'character of variant.reference_bases ({}).'.format(
                           refbases[self.half_width], self.half_width,
                           dv_call.variant.reference_bases))

    # The image begins with reference_band_height identical rows of the
    # encoded reference sequence.
    reference_row = self._encoder.encode_reference(refbases)
    pileup_rows = [reference_row] * self.reference_band_height

    def _encoded_read_rows():
      """Yields (alignment position, row) for each read that encodes validly."""
      for read in reads:
        encoded = self._encoder.encode_read(dv_call, refbases, read,
                                            image_start_pos, alt_alleles)
        if encoded is not None:
          yield read.alignment.position.position, encoded

    # Down-sample to at most self.max_reads rows with a seeded RNG (so image
    # construction is deterministic), then order rows by alignment position.
    rng = np.random.RandomState(self._options.random_seed)
    sampled_rows = utils.reservoir_sample(
        _encoded_read_rows(), self.max_reads, random=rng)
    pileup_rows.extend(
        row for _, row in sorted(sampled_rows, key=lambda x: x[0]))

    # Pad the image out to self.height rows with empty (all black) pixels.
    n_missing = self.height - len(pileup_rows)
    if n_missing > 0:
      pileup_rows.extend([self._empty_image_row()] * n_missing)

    # Vertically stack the image rows into one h x w x DEFAULT_NUM_CHANNEL
    # image.
    return np.vstack(pileup_rows)
예제 #7
0
  def build_pileup(self, dv_call, refbases, reads, alt_alleles):
    """Creates a pileup tensor for dv_call.

    Args:
      dv_call: learning.genomics.deepvariant.DeepVariantCall object with
        information on our candidate call and allele support information.
      refbases: A string options.width in length containing the reference base
        sequence to encode. The middle base of this string should be at the
        start of the variant in dv_call.
      reads: Iterable of third_party.nucleus.protos.Read objects that we'll
        use to encode the read information supporting our call. Assumes each
        read is aligned and is well-formed (e.g., has bases and quality
        scores, cigar). Rows of the image are encoded in the same order as
        reads.
      alt_alleles: A collection of alternative_bases from dv_call.variant that
        we are treating as "alt" when constructing this pileup image. A read
        will be considered supporting the "alt" allele if it occurs in the
        support list for any alt_allele in this collection.

    Returns:
      A [self.width, self.height, DEFAULT_NUM_CHANNEL] uint8 Tensor image.

    Raises:
      ValueError: if any arguments are invalid.
    """
    # Validate inputs up front, in the same order callers may rely on.
    if len(refbases) != self.width:
      raise ValueError('refbases is {} long but width is {}'.format(
          len(refbases), self.width))

    if not alt_alleles:
      raise ValueError('alt_alleles cannot be empty')
    if any(alt not in dv_call.variant.alternate_bases for alt in alt_alleles):
      raise ValueError('all elements of alt_alleles must be the alternate bases'
                       ' of dv_call.variant', alt_alleles, dv_call.variant)

    image_start_pos = dv_call.variant.start - self.half_width
    # Only check the middle base when the variant's reference is a single base.
    if (len(dv_call.variant.reference_bases) == 1 and
        refbases[self.half_width] != dv_call.variant.reference_bases):
      raise ValueError('center of refbases doesnt match variant.refbases',
                       self.half_width, refbases[self.half_width],
                       dv_call.variant)

    # The image begins with reference_band_height identical rows of the
    # encoded reference sequence.
    reference_row = self._encoder.encode_reference(refbases)
    pileup_rows = [reference_row] * self.reference_band_height

    def _encoded_read_rows():
      """Yields (alignment position, row) for each read that encodes validly."""
      for read in reads:
        encoded = self._encoder.encode_read(dv_call, refbases, read,
                                            image_start_pos, alt_alleles)
        if encoded is not None:
          yield read.alignment.position.position, encoded

    # Down-sample to at most self.max_reads rows, then order rows by the
    # read's alignment position.
    sampled_rows = utils.reservoir_sample(
        _encoded_read_rows(), self.max_reads, random=self._random)
    pileup_rows.extend(
        row for _, row in sorted(sampled_rows, key=lambda x: x[0]))

    # Pad the image out to self.height rows with empty (all black) pixels.
    n_missing = self.height - len(pileup_rows)
    if n_missing > 0:
      pileup_rows.extend([_empty_image_row(len(refbases))] * n_missing)

    # Vertically stack the image rows into one h x w x DEFAULT_NUM_CHANNEL
    # image.
    return np.vstack(pileup_rows)
예제 #8
0
    def build_pileup_for_one_sample(reads, sample):
      """Create read pileup image section for one sample."""
      # Begin with reference_band_height identical rows of encoded reference.
      pileup_rows = ([self._encoder.encode_reference(refbases)] *
                     self.reference_band_height)

      def _haplotype_index(read, target_hp_tag):
        """Returns the sort index derived from a read's HP tag."""
        if 'HP' not in read.info:
          return 0  # Reads without an HP tag sort as haplotype 0.
        hp_field = next(iter(read.info.get('HP').values))
        if not hp_field.HasField('int_value'):
          return 0
        hp_value = hp_field.int_value
        if target_hp_tag > 0 and hp_value == target_hp_tag:
          # The requested HP tag maps to -1 so those reads sort to the top of
          # the pileup image.
          return -1
        # Negative HP values are treated as untagged.
        return 0 if hp_value < 0 else hp_value

      def _encoded_read_rows():
        """Yields (haplotype, position, row) for each read that encodes."""
        for read in reads:
          encoded = self._encoder.encode_read(dv_call, refbases, read,
                                              image_start_pos, alt_alleles)
          if encoded is None:
            continue
          hap_idx = 0  # By default, reads with no HP sort as haplotype 0.
          if self._options.sort_by_haplotypes:
            hap_idx = _haplotype_index(
                read, self._options.sort_by_haplotypes_sample_hp_tag)
          yield hap_idx, read.alignment.position.position, encoded

      # Seeded RNG keeps the down-sampling (and hence the image)
      # deterministic.
      rng = np.random.RandomState(self._options.random_seed)

      # Use the sample's own pileup height when set, else the image height.
      if sample.pileup_height is not None:
        pileup_height = sample.pileup_height
      else:
        pileup_height = self.height

      # Down-sample to the per-sample read budget, then order rows by
      # (haplotype, alignment position).
      max_reads = pileup_height - self.reference_band_height
      sampled_rows = utils.reservoir_sample(
          _encoded_read_rows(), max_reads, random=rng)
      pileup_rows.extend(row for _, _, row in sorted(
          sampled_rows, key=lambda x: (x[0], x[1])))

      # Pad the section out to pileup_height rows with empty (all black)
      # pixels.
      n_missing = pileup_height - len(pileup_rows)
      if n_missing > 0:
        pileup_rows.extend([self._empty_image_row()] * n_missing)
      return pileup_rows