Exemplo n.º 1
0
def is_fusion_compatible(r1, r2, max_fusion_point_dist, allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Check that:
    (1) r1, r2 and both in the 5', or both in the 3'
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons
    """
    MAX_QSTART_FOR_5 = 100
    # first need to figure out ends
    # also check that both are in the 5' portion of r1 and r2
    assert r1.flag.strand == r2.flag.strand
    if r1.qStart < MAX_QSTART_FOR_5:  # in the 5' portion of r1
        if r2.qStart > MAX_QSTART_FOR_5:  # in the 3' portion, reject
            return False
        in_5_portion = True
    else:  # in the 3' portion of r1
        if r2.qStart < MAX_QSTART_FOR_5:
            return False
        in_5_portion = False
    plus_is_5end = (r1.flag.strand == '+')

    type = compare_junctions.compare_junctions(r1, r2)
    if type == 'exact':
        if len(r1.segments) == 1:
            if len(r2.segments) == 1:
                # single exon case, check fusion point is close enough
                if in_5_portion and plus_is_5end:
                    dist = abs(r1.sStart - r2.sStart)
                else:
                    dist = abs(r1.sEnd - r2.sEnd)
                return dist <= max_fusion_point_dist
            else:
                raise Exception, "Not possible case for multi-exon transcript and " + \
                        "single-exon transcript to be exact!"
        else:  # multi-exon case, must be OK
            return True
    elif type == 'super' or type == 'subset':
        if allow_extra_5_exons:
            # check that the 3' junction is identical
            # also check that the 3' end is relatively close
            if in_5_portion and plus_is_5end:
                if r1.segments[-1].start != r2.segments[-1].start: return False
                if abs(r1.segments[-1].end -
                       r2.segments[-1].end) > max_fusion_point_dist:
                    return False
            elif in_5_portion and (not plus_is_5end):
                if r1.segments[0].end != r2.segments[0].end: return False
                if abs(r1.segments[0].start -
                       r2.segments[0].start) > max_fusion_point_dist:
                    return False
            else:
                return False
        else:  # not OK because number of exons must be the same
            return False
    else:  #ex: partial, nomatch, etc...
        return False
Exemplo n.º 2
0
def is_fusion_compatible(r1, r2, max_fusion_point_dist, max_exon_end_dist, allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Check that:
    (1) r1, r2 and both in the 5', or both in the 3'
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons
    """
#    _ids = 'i1a_c1603/f67p459/1248,i1b_c19881/f7p368/1235,newClontech_i0HQ|c18279/f6p24/1229,i2b_c22046/f2p494/2157,i2a_c4714/f10p554/2152'.split(',')
#    if r1.qID in _ids or r2.qID in _ids: 
#        pdb.set_trace()
    # first need to figure out ends
    # also check that both are in the 5' portion of r1 and r2
    assert r1.flag.strand == r2.flag.strand
    if r1.qStart <= .5*r1.qLen: # in the 5' portion of r1
        if r2.qStart > .5*r2.qLen: # in the 3' portion, reject
            return False
        in_5_portion = True
    else: # in the 3' portion of r1
        if r2.qStart <= .5*r2.qLen: 
            return False
        in_5_portion = False
    plus_is_5end = (r1.flag.strand == '+')

    type = compare_junctions.compare_junctions(r1, r2)
    if type == 'exact':
        if len(r1.segments) == 1:
            if len(r2.segments) == 1:
                # single exon case, check fusion point is close enough
                if in_5_portion and plus_is_5end: dist = abs(r1.sStart - r2.sStart)
                else: dist = abs(r1.sEnd - r2.sEnd)
                return dist <= max_fusion_point_dist
            else:
                raise Exception, "Not possible case for multi-exon transcript and " + \
                        "single-exon transcript to be exact!"
        else: # multi-exon case, must be OK
            return True
    elif type == 'super' or type == 'subset':
        if allow_extra_5_exons:
            # check that the 3' junction is identical
            # also check that the 3' end is relatively close
            if in_5_portion and plus_is_5end:
                if abs(r1.segments[-1].start - r2.segments[-1].start) > max_exon_end_dist: return False
                if abs(r1.segments[-1].end - r2.segments[-1].end) > max_fusion_point_dist: return False
                return True
            elif in_5_portion and (not plus_is_5end):
                if abs(r1.segments[0].end - r2.segments[0].end) > max_exon_end_dist: return False
                if abs(r1.segments[0].start - r2.segments[0].start) > max_fusion_point_dist: return False
                return True
            else:
                return False
        else: # not OK because number of exons must be the same
            return False
    else: #ex: partial, nomatch, etc...
        return False
    def match_record_to_tree(self, r):
        """
        r --- GMAPRecord
        tree --- dict of chromosome --> strand --> IntervalTree

        If exact match (every exon junction), return the matching GMAPRecord
        Otherwise return None
        *NOTE*: the tree should be non-redundant so can return as soon as exact match is found!
        """
        matches = self.tree[r.chr][r.strand].find(r.start, r.end)
        for r2 in matches:
            r.segments = r.ref_exons
            r2.segments = r2.ref_exons
            if compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=self.internal_fuzzy_max_dist) == 'exact': # is a match!
                return r2
        return None
    def match_record_to_tree(self, r):
        """
        r --- GMAPRecord
        tree --- dict of chromosome --> strand --> IntervalTree

        If exact match (every exon junction), return the matching GMAPRecord
        Otherwise return None
        *NOTE*: the tree should be non-redundant so can return as soon as exact match is found!
        """
        matches = self.tree[r.chr][r.strand].find(r.start, r.end)
        for r2 in matches:
            r.segments = r.ref_exons
            r2.segments = r2.ref_exons
            if compare_junctions.compare_junctions(
                    r, r2, internal_fuzzy_max_dist=self.internal_fuzzy_max_dist
            ) == 'exact':  # is a match!
                return r2
        return None
def filter_out_subsets(recs, internal_fuzzy_max_dist):
    # recs must be sorted by start becuz that's the order they are written
    i = 0
    while i < len(recs)-1:
        j = i + 1
        while j < len(recs):
            if recs[j].start > recs[i].end: 
                break
            recs[i].segments = recs[i].ref_exons
            recs[j].segments = recs[j].ref_exons
            m = compare_junctions.compare_junctions(recs[i], recs[j], internal_fuzzy_max_dist)
            if can_merge(m, recs[i], recs[j], internal_fuzzy_max_dist):
                if m == 'super': # pop recs[j] 
                    recs.pop(j)
                else:
                    recs.pop(i)
                    j += 1
            else:
                j += 1
        i += 1
Exemplo n.º 6
0
def is_fusion_compatible(r1, r2, max_fusion_point_dist, max_exon_end_dist,
                         allow_extra_5_exons):
    """
    Helper function for: merge_fusion_exons()

    Check that:
    (1) r1, r2 and both in the 5', or both in the 3'
    (2) if single-exon, fusion point must be close by
        if multi-exon, every junction identical (plus below is True)
    (3) if allow_extra_5_exons is False, num exons must be the same
        if allow_extra_5_exons is True, only allow additional 5' exons
    """
    #    _ids = 'i1a_c1603/f67p459/1248,i1b_c19881/f7p368/1235,newClontech_i0HQ|c18279/f6p24/1229,i2b_c22046/f2p494/2157,i2a_c4714/f10p554/2152'.split(',')
    #    if r1.qID in _ids or r2.qID in _ids:
    #        pdb.set_trace()
    # first need to figure out ends
    # also check that both are in the 5' portion of r1 and r2
    assert r1.flag.strand == r2.flag.strand
    if r1.qStart <= .5 * r1.qLen:  # in the 5' portion of r1
        if r2.qStart > .5 * r2.qLen:  # in the 3' portion, reject
            return False
        in_5_portion = True
    else:  # in the 3' portion of r1
        if r2.qStart <= .5 * r2.qLen:
            return False
        in_5_portion = False
    plus_is_5end = (r1.flag.strand == '+')

    type = compare_junctions.compare_junctions(r1, r2)
    if type == 'exact':
        if len(r1.segments) == 1:
            if len(r2.segments) == 1:
                # single exon case, check fusion point is close enough
                if in_5_portion and plus_is_5end:
                    dist = abs(r1.sStart - r2.sStart)
                else:
                    dist = abs(r1.sEnd - r2.sEnd)
                return dist <= max_fusion_point_dist
            else:
                raise Exception, "Not possible case for multi-exon transcript and " + \
                        "single-exon transcript to be exact!"
        else:  # multi-exon case, must be OK
            return True
    elif type == 'super' or type == 'subset':
        if allow_extra_5_exons:
            # check that the 3' junction is identical
            # also check that the 3' end is relatively close
            if in_5_portion and plus_is_5end:
                if abs(r1.segments[-1].start -
                       r2.segments[-1].start) > max_exon_end_dist:
                    return False
                if abs(r1.segments[-1].end -
                       r2.segments[-1].end) > max_fusion_point_dist:
                    return False
                return True
            elif in_5_portion and (not plus_is_5end):
                if abs(r1.segments[0].end -
                       r2.segments[0].end) > max_exon_end_dist:
                    return False
                if abs(r1.segments[0].start -
                       r2.segments[0].start) > max_fusion_point_dist:
                    return False
                return True
            else:
                return False
        else:  # not OK because number of exons must be the same
            return False
    else:  #ex: partial, nomatch, etc...
        return False
Exemplo n.º 7
0
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = fuzzy_match.keys()
    keys.sort(key=lambda x: map(int, x.split('.')[1:]))
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(
            group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1 #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = fuzzy_match.keys()
    keys.sort(key=lambda x: map(int, x.split('.')[1:]))
    f_gff = open(gff_filename+'.fuzzy', 'w')
    f_group = open(group_filename+'.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match