Example 1
 def verify_target_genome_coverage(self, selected_probes, target_genomes,
                                   filter, desired_coverage,
                                   cover_extension=0):
     kmer_probe_map = probe.SharedKmerProbeMap.construct(
         probe.construct_kmer_probe_map_to_find_probe_covers(
             selected_probes, filter.mismatches, filter.lcf_thres,
             min_k=3, k=3)
     )
     probe.open_probe_finding_pool(kmer_probe_map,
                                   filter.cover_range_fn)
     for tg in [g for genomes_from_group in target_genomes
                for g in genomes_from_group]:
         num_bp_covered = 0
         for seq in tg.seqs:
             probe_cover_ranges = probe.find_probe_covers_in_sequence(seq)
             all_cover_ranges = []
             for cover_ranges in probe_cover_ranges.values():
                 for cv in cover_ranges:
                     start = max(0, cv[0] - cover_extension)
                     end = min(len(seq), cv[1] + cover_extension)
                     all_cover_ranges += [(start, end)]
             all_cover_ranges = interval.merge_overlapping(all_cover_ranges)
             for cover_range in all_cover_ranges:
                 num_bp_covered += cover_range[1] - cover_range[0]
         if desired_coverage <= 1.0:
             # check fraction covered
             desired_bp_covered = desired_coverage * tg.size()
             self.assertGreaterEqual(num_bp_covered, desired_bp_covered)
         else:
             # directly check num bp covered
             desired_coverage_adjusted = min(desired_coverage, tg.size())
             self.assertGreaterEqual(num_bp_covered,
                                     desired_coverage_adjusted)
     probe.close_probe_finding_pool()
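
The branch at the end treats desired_coverage as a fraction of each target genome when it is at most 1.0 and as an absolute number of base pairs otherwise (capped at the genome size). A caller could therefore exercise either mode; the calls below are a minimal sketch with hypothetical placeholder arguments (probes, target_genomes, filt), not taken from the test suite:

    # Hypothetical: require that at least 95% of each target genome is covered
    self.verify_target_genome_coverage(probes, target_genomes, filt, 0.95)
    # Hypothetical: require that at least 10,000 bp of each target genome is
    # covered, extending each probe's cover range by 50 bp on both sides
    self.verify_target_genome_coverage(probes, target_genomes, filt, 10000,
                                       cover_extension=50)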
Example 2
 def test_multiple_searches_with_same_pool(self):
     """Tests more than one call to find_probe_covers_in_sequence()
     with the same pool.
     """
     np.random.seed(1)
     sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
     sequence_b = 'GHIDAXYZXYZAAABCABCD'
     a = probe.Probe.from_str('AXYZXYZ')
     b = probe.Probe.from_str('AABCABC')
     probes = [a, b]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    0,
                                                                    6,
                                                                    k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         found_a = probe.find_probe_covers_in_sequence(sequence_a)
         self.assertEqual(found_a, {a: [(3, 10)]})
         found_b = probe.find_probe_covers_in_sequence(sequence_b)
         self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
         probe.close_probe_finding_pool()
Example 3
 def test_island_with_exact_match1(self):
     """Tests the 'island_with_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
     a = probe.Probe.from_str('XDEFGH')
     b = probe.Probe.from_str('CXEFGH')
     c = probe.Probe.from_str('CDXFGH')
     d = probe.Probe.from_str('CDEXGH')
     e = probe.Probe.from_str('CDEFXH')
     f = probe.Probe.from_str('CDEFGX')
     g = probe.Probe.from_str('CDEFGH')
     probes = [a, b, c, d, e, f, g]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    1,
                                                                    6,
                                                                    k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(2, 8), (16, 22)])
         self.assertCountEqual(found[b], [(2, 8)])
         self.assertFalse(c in found)
         self.assertFalse(d in found)
         self.assertCountEqual(found[e], [(2, 8)])
         self.assertCountEqual(found[f], [(2, 8)])
         self.assertCountEqual(found[g], [(2, 8), (16, 22)])
         probe.close_probe_finding_pool()
Example 4
 def test_more_than_cover(self):
     """Tests with short sequence and short probes
     where probes contain more than what they cover.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
     a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
     b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
     c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
     probes = [a, b, c]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    0,
                                                                    6,
                                                                    k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(2, 11), (118, 124)])
         self.assertCountEqual(found[b], [(6, 14)])
         self.assertCountEqual(found[c], [(5, 12)])
         probe.close_probe_finding_pool()
Example 5
    def test_pigeonhole_with_mismatch(self):
        """Tests with short sequence and short probes
        where the call to construct_kmer_probe_map_to_find_probe_covers tries
        the pigeonhole approach.
        """
        np.random.seed(1)
        sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        a = probe.Probe.from_str('GHIJXL')
        b = probe.Probe.from_str('BTUVWX')
        c = probe.Probe.from_str('ACEFHJ')
        probes = [a, b, c]

        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                       1,
                                                                       6,
                                                                       min_k=3,
                                                                       k=4)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        # This should try the pigeonhole approach, which should choose k=3
        self.assertEqual(kmer_map.k, 3)
        f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(6, 12)])
            self.assertCountEqual(found[b], [(18, 24)])
            self.assertFalse(c in found)
            probe.close_probe_finding_pool()

        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                       1,
                                                                       6,
                                                                       min_k=4,
                                                                       k=4)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        # This should try the pigeonhole approach and fail because it
        # chooses k=3, but min_k=4. So it should then try the random
        # approach with k=4.
        self.assertEqual(kmer_map.k, 4)
        f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(6, 12)])
            self.assertCountEqual(found[b], [(18, 24)])
            self.assertFalse(c in found)
            probe.close_probe_finding_pool()
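
The assertions on kmer_map.k above follow a pigeonhole argument: if a probe must share a region of length lcf_thres with the sequence while tolerating at most `mismatches` mismatched bases, then splitting that region into mismatches + 1 pieces guarantees that at least one piece of length lcf_thres // (mismatches + 1) matches exactly, so k-mers of that length suffice. The snippet below is a minimal sketch of that calculation (an assumption about the idea, not the library's exact selection code); with mismatches=1 and lcf_thres=6 it yields k=3, which satisfies min_k=3 in the first call but not min_k=4 in the second, forcing the fallback to the random approach with k=4.

    # Sketch (assumed): the largest k guaranteed an exact match by the pigeonhole principle
    def pigeonhole_k(mismatches, lcf_thres):
        return lcf_thres // (mismatches + 1)

    assert pigeonhole_k(1, 6) == 3   # accepted when min_k=3
    assert pigeonhole_k(1, 6) < 4    # below min_k=4, so the random approach with k=4 is used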
Example 6
    def test_open_close_pool_without_work(self):
        """Tests opening a probe finding pool and closing it without doing
        any work in between.

        Due to a bug in early versions of Python, closing the pool could
        hang indefinitely when no work had been submitted.
        """
        probes = [probe.Probe.from_str('ABCDEF')]
        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 0, 6, k=3)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
        for n_workers in [1, 2, 4, 7, 8, None]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            time.sleep(1)
            probe.close_probe_finding_pool()
            time.sleep(1)
Example 7
 def test_repetitive(self):
     """Tests with short sequence and short probes
     where the sequence and probes have repetitive sequences, so that
     one probe can cover a lot of the sequence.
     """
     np.random.seed(1)
     sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
     a = probe.Probe.from_str('NAAAAAAN')
     probes = [a]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(3, 13), (25, 38)])
         probe.close_probe_finding_pool()
Example 8
 def test_island_with_exact_match2(self):
     """Tests the 'island_with_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('HXJKLMNOPCDE')
     b = probe.Probe.from_str('XIJKXMNOXCDE')
     c = probe.Probe.from_str('XIJKXMNOPXDE')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 3, 6, k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(7, 19)])
         self.assertFalse(b in found)
         self.assertCountEqual(found[c], [(7, 19)])
         probe.close_probe_finding_pool()
Example 9
 def test_two_occurrences(self):
     """Tests with short sequence and short probes
     where one probe appears twice.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('CDEFGH')
     b = probe.Probe.from_str('GHIJKL')
     c = probe.Probe.from_str('STUVWX')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(2, 8), (16, 22)])
         self.assertCountEqual(found[b], [(6, 12)])
         self.assertFalse(c in found)
         probe.close_probe_finding_pool()
Example 10
 def test_too_short_sequence_small_k(self):
     """Tests with sequence shorter than the probe length.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         found = probe.find_probe_covers_in_sequence(sequence)
         self.assertCountEqual(found[a], [(0, 9)])
         self.assertFalse(b in found)
         self.assertCountEqual(found[c], [(3, 9)])
         self.assertCountEqual(found[d], [(0, 6)])
         probe.close_probe_finding_pool()
Example 11
 def test_too_short_sequence_large_k(self):
     """Tests with sequence shorter than the probe length and also
     shorter than k.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     # probe.find_probe_covers_in_sequence() should not attempt
     # to cover the sequence (return {}), but should run gracefully
     for k in [10, 11, 12]:
         kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
             probes, 0, 6, min_k=k, k=k)
         kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
         f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
         for n_workers in [1, 2, 4, 7, 8]:
             probe.open_probe_finding_pool(kmer_map, f, n_workers)
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertEqual(found, {})
             probe.close_probe_finding_pool()
Example 12
    def _find_covers_in_target_genomes(self):
        """Find intervals across the target genomes covered by the probe set.

        This considers the given probe set (self.probes) and determines the
        intervals, in each genome of the target genomes (as well as their
        reverse complements), that are covered by the probes. This saves a
        dict, self.target_covers, as follows: self.target_covers[i][j][b]
        is a list of all the intervals covered by the probes in the target
        genome j of grouping i (in the reverse complement of the genome if
        b is True, and in the provided (forward) sequence if b is False).

        The endpoints of the intervals are offset so as to give unique integer
        positions in the genome (e.g., endpoints in the second chromosome
        are offset based on the length of the first chromosome). There may
        be duplicate intervals if two probes cover the same region of a
        sequence.
        """
        logger.info("Finding probe covers across target genomes")
        logger.info("Building map from k-mers to probes")
        # Note that if adapters are added to the probes before this filter
        # is run (which would be typical), then self.lcf_thres will likely
        # be less than the probe length. So the k-mer to probe map will
        # be constructed using the random approach (yielding many k-mers
        # and thus a slower runtime in finding probe covers) rather than
        # the pigeonhole approach.
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                self.probes, self.mismatches, self.lcf_thres,
                min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k)
        )
        probe.open_probe_finding_pool(kmer_probe_map,
                                      self.cover_range_fn)

        self.target_covers = {}
        for i, j, gnm, rc in self._iter_target_genomes():
            if not rc:
                logger.info(("Computing coverage in grouping %d (of %d), "
                             "with target genome %d (of %d)"), i + 1,
                            len(self.target_genomes), j + 1,
                            len(self.target_genomes[i]))
            if i not in self.target_covers:
                self.target_covers[i] = {}
            if j not in self.target_covers[i]:
                self.target_covers[i][j] = {False: None, True: None}

            gnm_covers = []
            length_so_far = 0
            for sequence in gnm.seqs:
                if rc:
                    # Take the reverse complement of sequence
                    rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
                    sequence = ''.join([rc_map.get(b, b)
                                       for b in sequence[::-1]])

                # Find cover ranges of the probes, while allowing the ranges
                # to overlap (e.g., if one probe covers two regions that
                # overlap)
                probe_cover_ranges = probe.find_probe_covers_in_sequence(
                    sequence,
                    merge_overlapping=False)
                for p, cover_ranges in probe_cover_ranges.items():
                    for cover_range in cover_ranges:
                        # Extend the range covered by probe p on both sides
                        # by self.cover_extension
                        cover_start = max(0,
                            cover_range[0] - self.cover_extension)
                        cover_end = min(len(sequence),
                            cover_range[1] + self.cover_extension)
                        # The endpoints of the cover give positions in just
                        # this sequence (chromosome), so adjust them (according
                        # to length_so_far) to give a unique integer position
                        # in the genome gnm
                        adjusted_cover = (cover_start + length_so_far,
                                          cover_end + length_so_far)
                        gnm_covers += [adjusted_cover]
                length_so_far += len(sequence)
            self.target_covers[i][j][rc] = gnm_covers

        probe.close_probe_finding_pool()
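
The length_so_far bookkeeping above is what turns per-sequence (per-chromosome) coordinates into unique genome-wide positions. A small sketch with hypothetical sequence lengths:

    # Hypothetical genome with two sequences (chromosomes) of lengths 100 and 50
    cover_in_second_seq = (10, 20)    # cover range within the second sequence
    length_so_far = 100               # total length of sequences already iterated over
    adjusted_cover = (cover_in_second_seq[0] + length_so_far,
                      cover_in_second_seq[1] + length_so_far)
    assert adjusted_cover == (110, 120)   # unique positions within the whole genome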
Example 13
    def _make_votes_across_target_genomes(self, probes):
        """Compute, for each probe, votes for adapters to the probe.

        Votes are computed, cumulatively, across all the target genomes in
        self.target_genomes.

        Args:
            probes: list of candidate probes

        Returns:
            a list L such that L[i] is a tuple (A,B) where A gives the
            number of 'A' adapter votes for the probe probes[i] and B gives
            the number of 'B' adapter votes.
        """
        logger.info("Building map from k-mers to probes")
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                probes,
                self.mismatches,
                self.lcf_thres,
                min_k=self.kmer_probe_map_k,
                k=self.kmer_probe_map_k)
        )
        probe.open_probe_finding_pool(kmer_probe_map,
                                      self.cover_range_fn)

        def iter_all_seqs():
            for genomes_from_group in self.target_genomes:
                for g in genomes_from_group:
                    for seq in g.seqs:
                        yield seq

        # Store adapter votes for each probe in a list where the element
        # at index i is a tuple (A,B) that corresponds to the probe
        # probes[i] where A gives the 'A' votes for the probe and B gives
        # the 'B' votes
        cumulative_votes = [(0, 0) for _ in range(len(probes))]
        for sequence in iter_all_seqs():
            # Compute votes for the adapters for each probe in 'sequence',
            # and also exchange all 'A' votes with 'B' votes and vice-versa.
            # Determine whether or not the exchange matches better with
            # cumulative_votes so far, and update cumulative_votes
            # accordingly.
            votes = self._votes_in_sequence(probes, sequence)
            votes_flipped = self._flip_AB_votes(votes)
            cumulative_votes_with_nonflipped = self._sum_votes_per_probe(
                cumulative_votes, votes)
            sum_nonflipped = self._sum_plurality_vote_across_probes(
                cumulative_votes_with_nonflipped)
            cumulative_votes_with_flipped = self._sum_votes_per_probe(
                cumulative_votes, votes_flipped)
            sum_flipped = self._sum_plurality_vote_across_probes(
                cumulative_votes_with_flipped)
            if sum_flipped > sum_nonflipped:
                # Add onto cumulative votes the votes in 'votes_flipped'
                # because these could be said to yield a more decisive
                # choice of adapter for each probe (i.e., the sum, across
                # all probes, of the most common vote of adapter for the
                # probe is higher) than the (unflipped) votes in 'votes'
                cumulative_votes = cumulative_votes_with_flipped
            else:
                cumulative_votes = cumulative_votes_with_nonflipped

        probe.close_probe_finding_pool()

        return cumulative_votes
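
To make the flip-or-not decision concrete, here is a minimal sketch with hypothetical vote tuples; the two helpers below only stand in for the behavior that the comments above ascribe to _sum_votes_per_probe and _sum_plurality_vote_across_probes:

    def add_votes(xs, ys):
        # Element-wise sum of (A, B) vote tuples, one per probe
        return [(xa + ya, xb + yb) for (xa, xb), (ya, yb) in zip(xs, ys)]

    def plurality_sum(vs):
        # Sum, across probes, of each probe's most common adapter vote
        return sum(max(a, b) for a, b in vs)

    cumulative = [(5, 1), (0, 4)]          # hypothetical (A, B) totals so far
    votes = [(0, 2), (3, 0)]               # votes computed from the next sequence
    flipped = [(b, a) for a, b in votes]   # 'A' and 'B' votes exchanged
    assert plurality_sum(add_votes(cumulative, votes)) == 9      # 5 + 4
    assert plurality_sum(add_votes(cumulative, flipped)) == 14   # 7 + 7, so keep the flipped votes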
Example 14
    def _make_ranks(self, candidate_probes, target_genomes):
        """Return a rank for each candidate probe to use in set cover.

        The "rank" of a candidate probe is a level of penalty for that
        probe, where higher ranks are more penalized. A set cover is sought
        that uses as many candidate probes from rank i as possible before
        considering probes with rank i+1. There are two considerations in
        computing ranks:
          - When identification is turned on (i.e., self.identify is True),
            the number of species that a probe "hits". Fewer hit species
            yields a smaller rank.
          - The number of bases in blacklisted genomes that the probe
            covers. Fewer covered bases yields a smaller rank.
        A probe that covers any part of a blacklisted genome will always
        receive a higher rank than a probe that does not. (This is achieved
        by first computing ranks using tuples of the form (x,y) where x=0
        for any probe that does not cover a blacklisted genome and x=1
        for a probe that does; y determines relative rank among those probes
        with the same x value. The tuple ranks are then converted into
        integer ranks by sorting the tuples.) When identification is
        enabled, a probe that hits more than one grouping (e.g., species)
        will always receive a higher rank than a probe that only hits one
        grouping (and does not cover any blacklisted genomes).

        When identification is not turned on, weighted set cover
        effectively does the following: (1) Covers as much of the target
        genomes as possible while minimizing the number of probes, without
        using any probe that covers any part of a blacklisted genome. (2)
        Covers whatever portions of the target genomes remain to be covered
        by using probes that cover parts of blacklisted genomes, while
        first seeking probes that cover less of the blacklisted genomes
        (i.e., even if probe B covers much more of the target genomes
        than probe A, A will be chosen before B if B covers a tiny bit
        more of the blacklisted genomes than A).
        When identification is turned on, weighted set cover: (1) Covers
        as much of the target genomes as possible while minimizing the
        number of probes, only using probes that hit one grouping. (2)
        Covers whatever portions of the target genomes remain to be covered
        while minimizing the number of probes, only using probes that hit
        two groupings, etc. (3) Considers probes that cover parts of
        blacklisted genomes, if there remains more of the target genomes to
        cover.

        The output is intended for input to set_cover.approx_multiuniverse
        as the 'ranks' input.

        Args:
            candidate_probes: list of candidate probes
            target_genomes: list of groups of target genomes

        Returns:
            dict mapping set_ids (0 through len(candidate_probes)-1, each
            corresponding to a candidate probe) to a rank (integer) for
            that candidate probe
        """
        # Only open a probe finding pool if it will be needed
        need_probe_finding_pool = (self.identify
                                   or len(self.blacklisted_genomes) > 0)
        if need_probe_finding_pool:
            logger.info("Building map from k-mers to probes")
            kmer_probe_map = probe.SharedKmerProbeMap.construct(
                probe.construct_kmer_probe_map_to_find_probe_covers(
                    candidate_probes,
                    self.mismatches_tolerant,
                    self.lcf_thres_tolerant,
                    min_k=self.kmer_probe_map_k,
                    k=self.kmer_probe_map_k))
            probe.open_probe_finding_pool(
                kmer_probe_map,
                self.cover_range_tolerant_fn,
                use_native_dict=self.kmer_probe_map_use_native_dict)

        if self.identify:
            # Find the number of target genome groupings (e.g., species)
            # that each probe "hits". (A probe "hits" a grouping if it
            # covers a part of at least one target genome in that grouping.)
            # A probe that hits just one grouping is good for
            # identification and is therefore ranked relatively low (a
            # rank of 1); probes that hit more than one grouping are poor
            # for identification and their ranks are equal to the number
            # of groupings they hit.
            num_groupings_hit = self._count_num_groupings_hit(
                candidate_probes, target_genomes)
            rank_val = {p: (0, hit) for p, hit in num_groupings_hit.items()}
        else:
            # Start each probe with the same rank
            rank_val = {p: (0, 0) for p in candidate_probes}

        # Find probes that cover part of a blacklisted genome.
        # All of these get a higher rank than any probe that does not
        # cover any part of a blacklisted genome (since the first element
        # of the tuple put into rank_val is 1, but 0 was the first
        # element of the tuple above) and the rank among these is based
        # on the number of bp they cover.
        blacklisted_bp_covered = self._count_blacklisted_bp_covered(
            candidate_probes)
        for p, bp in blacklisted_bp_covered.items():
            if bp > 0:
                rank_val[p] = (1, bp)

        if need_probe_finding_pool:
            probe.close_probe_finding_pool()
            del kmer_probe_map
            gc.collect()

        # Convert the ranks, specified as tuples, into ranks from 0
        # upward. The probe(s) with the smallest tuple rank get(s)
        # rank 0, the probe(s) with the next smallest tuple rank get(s)
        # rank 1, and so on.
        all_rank_tuples = sorted(set(rank_val.values()))
        tuple_rank_idx = {}
        for i in range(len(all_rank_tuples)):
            tuple_rank_idx[all_rank_tuples[i]] = i
        ranks = {}
        for set_id, p in enumerate(candidate_probes):
            ranks[set_id] = tuple_rank_idx[rank_val[p]]

        return ranks
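
The tuple-to-integer conversion at the end can be illustrated with hypothetical rank tuples: any probe that avoids the blacklisted genomes (first tuple element 0) sorts, and therefore ranks, below every probe that covers them (first element 1), and ties share a rank.

    # Hypothetical tuple ranks for four probes
    rank_val = {'p1': (0, 1), 'p2': (0, 3), 'p3': (1, 10), 'p4': (0, 1)}
    all_rank_tuples = sorted(set(rank_val.values()))   # [(0, 1), (0, 3), (1, 10)]
    tuple_rank_idx = {t: i for i, t in enumerate(all_rank_tuples)}
    ranks = {p: tuple_rank_idx[t] for p, t in rank_val.items()}
    assert ranks == {'p1': 0, 'p2': 1, 'p3': 2, 'p4': 0}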
Example 15
    def _make_sets(self, candidate_probes, target_genomes):
        """Return a collection of sets to use in set cover.

        In the returned collection of sets, each set corresponds to a
        candidate probe and contains the bases of the target genomes
        covered by the candidate probe. The target genomes must be in
        grouped lists inside the list target_genomes.

        The output is intended for input to set_cover.approx_multiuniverse
        as the 'sets' input.

        Args:
            candidate_probes: list of candidate probes
            target_genomes: list of groups of target genomes

        Returns:
            a dict mapping set_ids (from 0 through
            len(candidate_probes)-1) to dicts, where the dict for a
            particular set_id maps universe_ids to sets. set_id
            corresponds to a candidate probe in candidate_probes and
            universe_id is a tuple that corresponds to a target genome in
            a grouping from target_genomes. The j'th target genome
            from the i'th grouping in target_genomes is given
            universe_id equal to (i,j). That is, i ranges from 0 through
            len(target_genomes)-1 (i.e., the number of groupings) and
            j ranges from 0 through (n_i)-1 where n_i is the number of
            target genomes in the i'th group. In the returned value
            (sets), sets[set_id][universe_id] is a set of all the bases
            (as an instance of interval.IntervalSet) covered by probe
            set_id in the target genome universe_id. (If
            sets[set_id][universe_id] contains just one interval, then that
            interval is stored directly as a tuple -- not in an instance
            of interval.IntervalSet -- to save space and it should be
            converted to an interval.IntervalSet when needed.)
        """
        logger.info("Building map from k-mers to probes")
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                candidate_probes,
                self.mismatches,
                self.lcf_thres,
                min_k=self.kmer_probe_map_k,
                k=self.kmer_probe_map_k))
        probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

        probe_id = {}
        sets = {}
        for id, p in enumerate(candidate_probes):
            probe_id[p] = id
            sets[id] = {}

        for i, genomes_from_group in enumerate(target_genomes):
            for j, gnm in enumerate(genomes_from_group):
                logger.info(("Computing coverage in grouping %d (of %d), "
                             "with target genome %d (of %d)"), i + 1,
                            len(target_genomes), j + 1,
                            len(genomes_from_group))
                universe_id = (i, j)
                length_so_far = 0
                for sequence in gnm.seqs:
                    probe_cover_ranges = probe.find_probe_covers_in_sequence(
                        sequence)
                    # Add the bases of sequence that are covered by all the
                    # probes into sets with universe_id equal to (i,j)
                    for p, cover_ranges in probe_cover_ranges.items():
                        set_id = probe_id[p]
                        for cover_range in cover_ranges:
                            # Extend the range covered by probe p on both sides
                            # by self.cover_extension
                            cover_start = max(
                                0, cover_range[0] - self.cover_extension)
                            cover_end = min(
                                len(sequence),
                                cover_range[1] + self.cover_extension)
                            # The endpoints of the cover give positions in
                            # just this sequence (chromosome), so adding the
                            # lengths of all the sequences previously iterated
                            # (length_so_far) onto them gives unique
                            # integer positions in the genome gnm
                            adjusted_cover = (cover_start + length_so_far,
                                              cover_end + length_so_far)
                            if universe_id not in sets[set_id]:
                                # Since a list has a lot of overhead and most
                                # probes align to just one interval, simply
                                # store that interval alone (not in a list)
                                sets[set_id][universe_id] = adjusted_cover
                            else:
                                prev_cover = sets[set_id][universe_id]
                                if isinstance(prev_cover, tuple):
                                    # This probe now aligns to two intervals in
                                    # this universe/genome, so store them in
                                    # a list
                                    sets[set_id][universe_id] = [prev_cover]
                                sets[set_id][universe_id].append(
                                    adjusted_cover)
                    length_so_far += len(sequence)

        probe.close_probe_finding_pool()
        del kmer_probe_map
        gc.collect()

        # Make an IntervalSet out of the intervals of each set. But if
        # there is just one interval in a set, then save space by leaving
        # that entry as a tuple.
        for set_id in sets.keys():
            for universe_id in sets[set_id].keys():
                intervals = sets[set_id][universe_id]
                if not isinstance(intervals, tuple):
                    sets[set_id][universe_id] = interval.IntervalSet(intervals)
                # Else, there is just one interval in this set; leave it
                # stored directly as a tuple

        return sets
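
As a sketch of the returned structure (hypothetical values, and assuming the same interval module imported by this code): a probe covering a single interval in a genome is stored as a bare tuple, while a probe covering two or more intervals is wrapped in an interval.IntervalSet.

    # Hypothetical return value for two candidate probes and one grouping with
    # two target genomes, i.e. universe_ids (0, 0) and (0, 1)
    sets = {
        0: {(0, 0): (150, 250)},                                  # one interval: bare tuple
        1: {(0, 0): interval.IntervalSet([(10, 110), (400, 500)]),
            (0, 1): (75, 175)},
    }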
Example 16
    def run_random(self,
                   n,
                   genome_min,
                   genome_max,
                   num_probes,
                   probe_length=100,
                   lcf_thres=None,
                   seed=1,
                   n_workers=2,
                   use_native_dict=False):
        """Run tests with a randomly generated sequence.

        Repeatedly runs tests in which a sequence is randomly generated,
        probes are generated from that sequence, and then the probes are
        looked up in the sequence.

        Creates the probes with the intention of determining coverage with
        a longest common substring.

        Args:
            n: number of times to run the test
            genome_min/genome_max: the genome (sequence) size is
                randomly chosen between genome_min and genome_max
            num_probes: the number of probes generated from the random
                sequence
            probe_length: number of bp to make each probe
            lcf_thres: lcf threshold parameter; when None, it is
                randomly chosen to be either 80 or 100
            seed: random number generator seed
            n_workers: number of workers to have in a probe finding pool
            use_native_dict: have the probe finding pool use a native Python
                dict
        """
        np.random.seed(seed)
        fixed_lcf_thres = lcf_thres

        for _ in range(n):
            if fixed_lcf_thres is not None:
                lcf_thres = fixed_lcf_thres
            else:
                # Choose either lcf_thres=80 or lcf_thres=100
                lcf_thres = np.random.choice([80, 100])
            # Make a random sequence
            seq_length = np.random.randint(genome_min, genome_max)
            sequence = "".join(
                np.random.choice(['A', 'T', 'C', 'G'],
                                 size=seq_length,
                                 replace=True))
            desired_probe_cover_ranges = defaultdict(list)
            # Make num_probes random probes
            probes = []
            for m in range(num_probes):
                subseq_start = np.random.randint(0, seq_length - probe_length)
                subseq_end = subseq_start + probe_length
                cover_length = np.random.randint(lcf_thres, probe_length + 1)
                cover_start = subseq_start + \
                    np.random.randint(0, probe_length - cover_length + 1)
                cover_end = min(seq_length, cover_start + cover_length)
                probe_str_cover = sequence[cover_start:cover_end]
                # Add random bases before and after what the probe should
                # cover
                probe_str_start = "".join(
                    np.random.choice(['A', 'T', 'C', 'G'],
                                     size=cover_start - subseq_start,
                                     replace=True))
                probe_str_end = "".join(
                    np.random.choice(['A', 'T', 'C', 'G'],
                                     size=subseq_end - cover_end,
                                     replace=True))
                probe_str = probe_str_start + probe_str_cover + probe_str_end
                # Add 0, 1, 2, or 3 random mismatches
                for k in range(np.random.randint(0, 4)):
                    pos = np.random.randint(0, probe_length)
                    base_choices = [
                        b for b in ['A', 'T', 'C', 'G'] if b != probe_str[pos]
                    ]
                    probe_str = probe_str[:pos] + \
                        "".join(np.random.choice(base_choices, size=1)) + \
                        probe_str[(pos + 1):]
                p = probe.Probe.from_str(probe_str)
                desired_probe_cover_ranges[p].append((cover_start, cover_end))
                probes += [p]
            kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
                probes, 3, lcf_thres)
            kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
            f = probe.probe_covers_sequence_by_longest_common_substring(
                3, lcf_thres)
            probe.open_probe_finding_pool(kmer_map,
                                          f,
                                          n_workers,
                                          use_native_dict=use_native_dict)
            found = probe.find_probe_covers_in_sequence(sequence)
            probe.close_probe_finding_pool()
            # Check that this didn't find any extraneous probes and that
            # it found at least 95% of the original (it may miss some
            # due to false negatives in the approach)
            self.assertLessEqual(len(found), len(probes))
            self.assertGreaterEqual(len(found), 0.95 * len(probes))
            # Check that each desired probe was found correctly
            for p, cover_ranges in desired_probe_cover_ranges.items():
                if p not in found:
                    continue
                found_cover_ranges = found[p]
                # This probe most likely was found once, but could have
                # been missed (due to false negatives in the approach) and
                # may have been found more than once due to chance (but
                # probably not too much more!)
                self.assertTrue(len(found_cover_ranges) in [1, 2])
                # The cover ranges should have been captured, and the ones
                # found may extend past what was desired by a small amount due
                # to allowing mismatches and chance
                # Because of mismatches possibly added to the end of the
                # desired cover range, what was recaptured may not always
                # encompass the entire cover range, so allow some small
                # tolerance
                for desired_cv in cover_ranges:
                    found_desired_cv = False
                    for found_cv in found_cover_ranges:
                        left_diff = desired_cv[0] - found_cv[0]
                        right_diff = found_cv[1] - desired_cv[1]
                        if left_diff >= -7 and left_diff < 15:
                            if right_diff >= -7 and right_diff < 15:
                                found_desired_cv = True
                                break
                    self.assertTrue(found_desired_cv)
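
A test method would drive this helper with concrete parameters; the call below is a hypothetical invocation with illustrative argument values, not one taken from the test suite:

    def test_run_random_small_genomes(self):
        # Hypothetical: 5 iterations, genomes of 5,000-10,000 bp, 50 probes of
        # the default 100 bp each, searched with the default 2 pool workers
        self.run_random(5, 5000, 10000, 50)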