Exemplo n.º 1
0
    def filterOverlaps(self, overlapPercCutoff=70):
        """Filter out amplicons that substantially overlap one another.

        Within each group of substantially overlapping amplicons, only the
        amplicon with the highest PPC is kept.
        The MFEprimerRes attribute must be set.
        In-place edit: ``self.MFEprimerRes`` is replaced by a table holding
        only the surviving rows (in their original relative order).

        Parameters
        ----------
        overlapPercCutoff : float
            Percent of overlap to consider 'substantially' overlapping.
            It is compared against the first element of the value returned
            by ``self._calcPercOverlap(iv1, iv2)``.

        Raises
        ------
        AttributeError
            If ``self.MFEprimerRes`` is None.
        TypeError
            If any row has ``BindingStart`` greater than ``BindingStop``.
        """
        if self.MFEprimerRes is None:
            msg = 'genome object does not have MFEprimerRes attribute.' + \
                  ' Run MFEprimer() first'
            raise AttributeError(msg)

        # Loading one interval per row. The *positional* row number is
        # stored (not the index label yielded by iterrows) because the
        # surviving rows are selected with .iloc at the end; labels and
        # positions differ whenever the index is not a default 0..n-1 range.
        tree = IntervalTree()
        for pos, (_, row) in enumerate(self.MFEprimerRes.iterrows()):
            # sanity check for + strand
            if row['BindingStart'] > row['BindingStop']:
                raise TypeError('MFEprimer binding start-stop is not + strand')
            tree.addi(row['BindingStart'], row['BindingStop'],
                      [pos, row['PPC'], row['Size']])

        # Range query: newer intervaltree releases expose overlap(); older
        # ones expose search(). Prefer overlap() when it exists.
        findOverlaps = getattr(tree, 'overlap', None) or tree.search

        # `tree` stays intact for querying; removals go to the `kept` copy.
        kept = tree.copy()
        for iv1 in tree:
            # skipping if already removed as a lower-PPC overlap
            if iv1 not in kept:
                continue

            overlaps = findOverlaps(iv1.begin, iv1.end)

            # dropping those that poorly overlap with iv1
            lowOverlap = set()
            for iv2 in overlaps:
                if iv1.overlaps(iv2):
                    percOverlaps = self._calcPercOverlap(iv1, iv2)
                    if percOverlaps[0] < overlapPercCutoff:
                        lowOverlap.add(iv2)

            # substantially overlapping intervals that are still retained
            candidates = [iv for iv in overlaps
                          if iv not in lowOverlap and iv in kept]

            # keeping only the highest-PPC interval of the group
            if len(candidates) > 1:
                candidates.sort(key=lambda iv: iv.data[1], reverse=True)
                for loser in candidates[1:]:
                    kept.discard(loser)

        # selecting the surviving rows; sorted so the original row order is
        # preserved rather than following the tree's arbitrary iteration order
        keepPos = sorted(iv.data[0] for iv in kept)
        self.MFEprimerRes = self.MFEprimerRes.iloc[keepPos]