Exemplo n.º 1
0
    def commonNucleotidesAgreementDistance(a, b):
        """
        Compute a distance between two clusters from the sites they share.

        Only offsets present in both clusters are considered. A shared
        offset counts as matching when either:

            (a) The sets of most frequent nucleotides at that offset (a set
                can hold several equally frequent nucleotides) have a
                non-empty intersection. E.g. 6 x A and 2 x C in one cluster
                versus 3 x A and 3 x G in the other match, because A is
                among the commonest nucleotides of both.

            (b) The sets do not intersect, but the value returned by
                C{OffsetBases.highestFrequenciesMultiple} for the two
                offsets is at least C{ReadCluster.MIN_COMMONEST_MULTIPLE}.
                E.g. 100 x A versus 3 x G: in a merged cluster the As would
                completely overwhelm the Gs, so the clusters are not
                considered to really differ at that site.

        The fraction of shared offsets that match is a similarity, so the
        value returned is 1.0 minus that fraction.

        @param a: A C{ReadCluster} instance.
        @param b: A C{ReadCluster} instance.
        @return: The C{float} [0.0, 1.0] distance between C{a} and C{b}. This
            is 1.0 when the clusters have no offsets in common.
        """
        nucleotidesA = a.nucleotides
        nucleotidesB = b.nucleotides
        shared = set(nucleotidesA) & set(nucleotidesB)

        # No offsets in common: report the maximum distance.
        if not shared:
            return 1.0

        agreeing = 0
        for site in shared:
            basesA = nucleotidesA[site]
            basesB = nucleotidesB[site]

            if basesA.commonest & basesB.commonest:
                # Case (a): a most common nucleotide is shared.
                agreeing += 1
                continue

            ratio = OffsetBases.highestFrequenciesMultiple(basesA, basesB)
            # Sanity: a None ratio would mean only one nucleotide is
            # present, and that situation is already handled by the
            # commonest-set intersection test above.
            assert ratio is not None
            if ratio >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                # Case (b): the multiple reaches the required threshold.
                agreeing += 1

        return 1.0 - (agreeing / len(shared))
Exemplo n.º 2
0
    def mergeDescription(self, a, b, distance):
        """
        Make a textual description of a cluster merge.

        The description starts with four summary lines (the merge and its
        distance, the read and offset counts of each cluster, and the number
        of matching shared offsets). These are followed by one line for each
        offset covered by either cluster, in ascending offset order, showing
        the 1-based offset, the base counts of each cluster at that offset
        ('-' if the cluster does not cover it) and a match marker:

            '*' if the clusters' sets of commonest nucleotides intersect.
            '+' if they do not intersect but the highest frequencies
                multiple is at least C{ReadCluster.MIN_COMMONEST_MULTIPLE}.
            ''  otherwise (including offsets covered by only one cluster).

        If neither cluster covers any offset, only the summary lines are
        returned.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side descriptions of clusters C{a} and C{b}.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]
        nucleotides1 = cluster1.nucleotides
        nucleotides2 = cluster2.nucleotides

        result1 = []
        result2 = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(set(nucleotides1) | set(nucleotides2))

        for offset in allOffsets:
            in1 = offset in nucleotides1
            in2 = offset in nucleotides2

            result1.append(
                nucleotides1[offset].baseCountsToStr() if in1 else '-')
            result2.append(
                nucleotides2[offset].baseCountsToStr() if in2 else '-')

            if not (in1 and in2):
                # Only one cluster covers this offset: nothing to compare.
                matches.append('')
                continue

            sharedCount += 1
            bases1 = nucleotides1[offset]
            bases2 = nucleotides2[offset]

            if bases1.commonest & bases2.commonest:
                matches.append('*')
                matchCount += 1
                continue

            multiple = OffsetBases.highestFrequenciesMultiple(bases1, bases2)
            # Sanity: the multiple cannot be None because that would mean
            # only one nucleotide is present, and that case is dealt with
            # by the commonest-set intersection test above.
            assert multiple is not None
            if multiple >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                matchCount += 1
                matches.append('+')
            else:
                matches.append('')

        # default=0 keeps max() from raising ValueError when neither cluster
        # covers any offset (there are then no per-offset lines to align).
        result1Width = max((len(line) for line in result1), default=0)
        result2Width = max((len(line) for line in result2), default=0)

        return '\n'.join([
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(nucleotides1), s(len(nucleotides1)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(nucleotides2), s(len(nucleotides2)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ] + [
            '  %d: %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2, match)
            for (offset, line1, line2,
                 match) in zip(allOffsets, result1, result2, matches)
        ])