def merge_orfs_with_two_tinyexons(preceding_orf, subsequent_orf,
        preceding_donor_sites=None,
        subsequent_acceptor_sites=None,
        orflist=None, **kwargs):
    """
    Find pairs of tinyexons that could jointly bridge two `neighbouring` Orfs

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor
                objects; only their positions are used here (None or an
                empty list yields an empty result)

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                SpliceAcceptor objects; only their positions are used here
                (None or an empty list yields an empty result)

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons
                (None or an empty list yields an empty result)

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: **kwargs is updated with KWARGS_PROJECTED_TINYEXON; the keys
                'min_tinyexon_intron_nt_length' and
                'max_tinyexon_intron_nt_length' must be present afterwards

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ),
             ordered on increasing summed length of both tinyexons
    """
    if not preceding_donor_sites: return []
    if not subsequent_acceptor_sites: return []
    if not orflist: return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']

    # Window in which a tinyexon-carrying Orf must (partially) fall; it
    # depends only on the splice site extremes, so compute it once.
    min_pos = min([d.pos for d in preceding_donor_sites]) +\
            min_intron_nt_length
    max_pos = max([a.pos for a in subsequent_acceptor_sites]) -\
            min_intron_nt_length

    # gather all potential tinyexons on Orfs that overlap with the window
    tinyexoncollection = []
    for orfX in orflist:
        if orfX.endPY <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        tinyexoncollection.extend(
                get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # nothing to combine
    if not tinyexoncollection: return []

    max_intron_nt_length = kwargs['max_tinyexon_intron_nt_length']

    # make tinyexoncollection ordered on start pos;
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection = _order_intron_list(
            tinyexoncollection, order_by='donor_pos')
    tinyexoncollection.reverse()

    # make 2-elemented combinations of tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk from the highest donor position downwards and stop as soon
        # as tinyexon2's donor lies in front of tinyexon1's donor
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length: continue
            if intron_length > max_intron_nt_length: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, eligible combi!
            # NOTE(review): the central intron is built with preceding_orf /
            # subsequent_orf, whereas merge_orfs_with_tinyexon uses
            # tinyexon.orf for its introns -- confirm this is intended.
            shared_nts = get_shared_nucleotides_at_splicesite(
                    subsequent_orf, preceding_orf,
                    tinyexon2.acceptor, tinyexon1.donor)
            intron = IntronConnectingOrfs(
                    tinyexon1.donor, tinyexon2.acceptor,
                    shared_nts, preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # Order on summed length only. Sorting the bare tuples would compare
    # the exon/intron objects themselves whenever two lengths tie.
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for (_totlen, exon1, intron, exon2) in tinyexoncombis]
def _select_eligible_splice_sites(sites, allow_non_canonical,
        min_pssm_score, non_canonical_min_pssm_score,
        min_pos=None, max_pos=None):
    """ Return the splice sites (input order kept) that pass all filters """
    eligible = []
    for site in sites:
        if site.is_canonical():
            if site.pssm_score < min_pssm_score: continue
        else:
            if not allow_non_canonical: continue
            if site.pssm_score < non_canonical_min_pssm_score: continue
        # position 0 is a valid boundary, so test for None explicitly
        if min_pos is not None and site.pos < min_pos: continue
        if max_pos is not None and site.pos > max_pos: continue
        eligible.append(site)
    return eligible


def merge_orfs_with_intron(orfD, orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length', **kwargs):
    """
    Merge 2 Orf objects by introns

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron;
                a false value disables this check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron;
                a false value disables this check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: ignore donors with a position smaller than this

    @type  max_donor_pos: integer or None
    @param max_donor_pos: ignore donors with a position larger than this

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: ignore acceptors with a position smaller than this

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: ignore acceptors with a position larger than this

    @type  order_by: string
    @param order_by: ordering criterion, passed on to _order_intron_list

    @rtype:  list
    @return: list with introns (IntronConnectingOrfs objects)
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # quickest scan possible: are there donors & acceptors at all?
    if not orfD._donor_sites or not orfA._acceptor_sites:
        # no introns possible because splice sites are missing
        return []

    # Filter both site lists once; the acceptor filters do not depend on
    # the donor, so there is no need to redo them for every donor.
    donors = _select_eligible_splice_sites(
            orfD._donor_sites, allow_non_canonical_donor,
            min_donor_pssm_score, non_canonical_min_donor_pssm_score,
            min_pos=min_donor_pos, max_pos=max_donor_pos)
    acceptors = _select_eligible_splice_sites(
            orfA._acceptor_sites, allow_non_canonical_acceptor,
            min_acceptor_pssm_score, non_canonical_min_acceptor_pssm_score,
            min_pos=min_acceptor_pos, max_pos=max_acceptor_pos)

    # quick scan: are the exons not too far from each other?
    # The shortest intron any pair can form runs from the rightmost donor
    # to the leftmost acceptor; comparing only the first site of each list
    # could reject Orf pairs that do have a bridgeable site combination.
    if max_intron_nt_length and donors and acceptors:
        shortest_intron = min([a.pos for a in acceptors]) -\
                max([d.pos for d in donors])
        if shortest_intron > max_intron_nt_length:
            # no introns possible that can bridge this gap
            return []

    introns = []
    for donor in donors:
        for acceptor in acceptors:
            # check phase compatibility (1) of splice sites
            if donor.phase != acceptor.phase: continue

            # check phase compatibility (2): intron length must account
            # for the frame shift between both Orfs
            intron_length = acceptor.pos - donor.pos
            intron_phase = intron_length % 3
            if (intron_phase + orfD.frame) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                    orfA, orfD, acceptor, donor)
            introns.append(
                    IntronConnectingOrfs(donor, acceptor, shared_nts,
                                         orfD, orfA))

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
def merge_orfs_with_tinyexon(preceding_orf, subsequent_orf,
        preceding_donor_sites=None,
        subsequent_acceptor_sites=None,
        orflist=None,
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Bridge two `neighbouring` Orfs by a tinyexon by applying preceding donors and subsequent acceptors

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects
                (None or an empty list yields an empty result)

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects
                (None or an empty list yields an empty result)

    @type  orflist: list
    @param orflist: list with Orf objects
                (None or an empty list yields an empty result)

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score; applied to the
                preceding donor sites here (None disables that filter) and
                passed on to find_tiny_exon_on_orf

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score; applied to
                the subsequent acceptor sites here (None disables that
                filter) and passed on to find_tiny_exon_on_orf

    @attention: **kwargs is accepted but not used in this function

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon, subsequent_intron )

    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE
    """
    if not preceding_donor_sites: return []
    if not subsequent_acceptor_sites: return []
    if not orflist: return []

    # Filter the flanking splice sites on pssm score once, instead of for
    # every Orf. A threshold of None (documented as allowed) means `no
    # filtering`; comparing a score with None is an error on Python 3.
    # TODO: these are in fact the sites on the normal, large orfs
    # TODO: do we want to check this pssm score?
    if min_donor_pssm_score is None:
        donors = list(preceding_donor_sites)
    else:
        donors = [d for d in preceding_donor_sites
                  if not (d.pssm_score < min_donor_pssm_score)]
    if min_acceptor_pssm_score is None:
        acceptors = list(subsequent_acceptor_sites)
    else:
        acceptors = [a for a in subsequent_acceptor_sites
                     if not (a.pssm_score < min_acceptor_pssm_score)]
    if not donors or not acceptors: return []

    # extremes of the usable splice sites, for a quick per-Orf rejection
    min_preceding_donor_sites_pos = min([d.pos for d in donors])
    max_subsequent_acceptor_sites_pos = max([a.pos for a in acceptors])

    # return list with (intron,tinyexon,intron) tuples
    returnexons = []
    for orfX in orflist:
        # check if orf is correctly positioned towards the splice sites' extremes
        if orfX.endPY <= min_preceding_donor_sites_pos: continue
        if orfX.startPY >= max_subsequent_acceptor_sites_pos: continue

        # if here, we can try to make a bridge by a tinyexon
        for donor in donors:
            # orf not correctly positioned towards the donor site
            if orfX.endPY <= donor.pos: continue

            for acceptor in acceptors:
                # orf not correctly positioned towards the acceptor site
                if orfX.startPY >= acceptor.pos: continue

                # okay, now try to bridge it!
                exons = find_tiny_exon_on_orf(orfX, order_by='total_pssm',
                        max_tinyexon_nt_length=max_tinyexon_nt_length,
                        min_tinyexon_nt_length=min_tinyexon_nt_length,
                        max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                        min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                        min_donor_pssm_score=min_donor_pssm_score,
                        min_acceptor_pssm_score=min_acceptor_pssm_score,
                        min_total_pssm_score=min_total_pssm_score,
                        preceding_donor=donor,
                        subsequent_acceptor=acceptor)

                # and append to returnexons
                for tinyexon in exons:
                    # make preceding intron
                    shared_nts_A = get_shared_nucleotides_at_splicesite(
                            tinyexon.orf, preceding_orf,
                            tinyexon.acceptor, donor)
                    preceding_intron = IntronConnectingOrfs(
                            donor, tinyexon.acceptor,
                            shared_nts_A, preceding_orf, tinyexon.orf)

                    # make subsequent intron
                    shared_nts_B = get_shared_nucleotides_at_splicesite(
                            subsequent_orf, tinyexon.orf,
                            acceptor, tinyexon.donor)
                    subsequent_intron = IntronConnectingOrfs(
                            tinyexon.donor, acceptor,
                            shared_nts_B, tinyexon.orf, subsequent_orf)

                    returnexons.append(
                            (preceding_intron, tinyexon, subsequent_intron))

    # and return the list of intron/exon/intron
    return returnexons
def merge_orfs_with_two_tinyexons(preceding_orf, subsequent_orf,
        preceding_donor_sites=None,
        subsequent_acceptor_sites=None,
        orflist=None, **kwargs):
    """
    Find pairs of tinyexons that could jointly bridge two `neighbouring` Orfs

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor
                objects; only their positions are used here (None or an
                empty list yields an empty result)

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                SpliceAcceptor objects; only their positions are used here
                (None or an empty list yields an empty result)

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons
                (None or an empty list yields an empty result)

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: **kwargs is updated with KWARGS_PROJECTED_TINYEXON; the keys
                'min_tinyexon_intron_nt_length' and
                'max_tinyexon_intron_nt_length' must be present afterwards

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ),
             ordered on increasing summed length of both tinyexons
    """
    if not preceding_donor_sites: return []
    if not subsequent_acceptor_sites: return []
    if not orflist: return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']

    # Window in which a tinyexon-carrying Orf must (partially) fall; it
    # depends only on the splice site extremes, so compute it once.
    min_pos = min([d.pos for d in preceding_donor_sites]) +\
            min_intron_nt_length
    max_pos = max([a.pos for a in subsequent_acceptor_sites]) -\
            min_intron_nt_length

    # gather all potential tinyexons on Orfs that overlap with the window
    tinyexoncollection = []
    for orfX in orflist:
        if orfX.endPY <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        tinyexoncollection.extend(
                get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # nothing to combine
    if not tinyexoncollection: return []

    max_intron_nt_length = kwargs['max_tinyexon_intron_nt_length']

    # make tinyexoncollection ordered on start pos;
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection = _order_intron_list(
            tinyexoncollection, order_by='donor_pos')
    tinyexoncollection.reverse()

    # make 2-elemented combinations of tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk from the highest donor position downwards and stop as soon
        # as tinyexon2's donor lies in front of tinyexon1's donor
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length: continue
            if intron_length > max_intron_nt_length: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, eligible combi!
            # NOTE(review): the central intron is built with preceding_orf /
            # subsequent_orf, whereas merge_orfs_with_tinyexon uses
            # tinyexon.orf for its introns -- confirm this is intended.
            shared_nts = get_shared_nucleotides_at_splicesite(
                    subsequent_orf, preceding_orf,
                    tinyexon2.acceptor, tinyexon1.donor)
            intron = IntronConnectingOrfs(
                    tinyexon1.donor, tinyexon2.acceptor,
                    shared_nts, preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # Order on summed length only. Sorting the bare tuples would compare
    # the exon/intron objects themselves whenever two lengths tie.
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for (_totlen, exon1, intron, exon2) in tinyexoncombis]
def _select_eligible_splice_sites(sites, allow_non_canonical,
        min_pssm_score, non_canonical_min_pssm_score,
        min_pos=None, max_pos=None):
    """ Return the splice sites (input order kept) that pass all filters """
    eligible = []
    for site in sites:
        if site.is_canonical():
            if site.pssm_score < min_pssm_score: continue
        else:
            if not allow_non_canonical: continue
            if site.pssm_score < non_canonical_min_pssm_score: continue
        # position 0 is a valid boundary, so test for None explicitly
        if min_pos is not None and site.pos < min_pos: continue
        if max_pos is not None and site.pos > max_pos: continue
        eligible.append(site)
    return eligible


def merge_orfs_with_intron(orfD, orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length', **kwargs):
    """
    Merge 2 Orf objects by introns

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron;
                a false value disables this check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron;
                a false value disables this check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: ignore donors with a position smaller than this

    @type  max_donor_pos: integer or None
    @param max_donor_pos: ignore donors with a position larger than this

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: ignore acceptors with a position smaller than this

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: ignore acceptors with a position larger than this

    @type  order_by: string
    @param order_by: ordering criterion, passed on to _order_intron_list

    @rtype:  list
    @return: list with introns (IntronConnectingOrfs objects)
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # quickest scan possible: are there donors & acceptors at all?
    if not orfD._donor_sites or not orfA._acceptor_sites:
        # no introns possible because splice sites are missing
        return []

    # Filter both site lists once; the acceptor filters do not depend on
    # the donor, so there is no need to redo them for every donor.
    donors = _select_eligible_splice_sites(
            orfD._donor_sites, allow_non_canonical_donor,
            min_donor_pssm_score, non_canonical_min_donor_pssm_score,
            min_pos=min_donor_pos, max_pos=max_donor_pos)
    acceptors = _select_eligible_splice_sites(
            orfA._acceptor_sites, allow_non_canonical_acceptor,
            min_acceptor_pssm_score, non_canonical_min_acceptor_pssm_score,
            min_pos=min_acceptor_pos, max_pos=max_acceptor_pos)

    # quick scan: are the exons not too far from each other?
    # The shortest intron any pair can form runs from the rightmost donor
    # to the leftmost acceptor; comparing only the first site of each list
    # could reject Orf pairs that do have a bridgeable site combination.
    if max_intron_nt_length and donors and acceptors:
        shortest_intron = min([a.pos for a in acceptors]) -\
                max([d.pos for d in donors])
        if shortest_intron > max_intron_nt_length:
            # no introns possible that can bridge this gap
            return []

    introns = []
    for donor in donors:
        for acceptor in acceptors:
            # check phase compatibility (1) of splice sites
            if donor.phase != acceptor.phase: continue

            # check phase compatibility (2): intron length must account
            # for the frame shift between both Orfs
            intron_length = acceptor.pos - donor.pos
            intron_phase = intron_length % 3
            if (intron_phase + orfD.frame) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                    orfA, orfD, acceptor, donor)
            introns.append(
                    IntronConnectingOrfs(donor, acceptor, shared_nts,
                                         orfD, orfA))

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
def merge_orfs_with_tinyexon(preceding_orf, subsequent_orf,
        preceding_donor_sites=None,
        subsequent_acceptor_sites=None,
        orflist=None,
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Bridge two `neighbouring` Orfs by a tinyexon by applying preceding donors and subsequent acceptors

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects
                (None or an empty list yields an empty result)

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects
                (None or an empty list yields an empty result)

    @type  orflist: list
    @param orflist: list with Orf objects
                (None or an empty list yields an empty result)

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score; applied to the
                preceding donor sites here (None disables that filter) and
                passed on to find_tiny_exon_on_orf

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score; applied to
                the subsequent acceptor sites here (None disables that
                filter) and passed on to find_tiny_exon_on_orf

    @attention: **kwargs is accepted but not used in this function

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon, subsequent_intron )

    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE
    """
    if not preceding_donor_sites: return []
    if not subsequent_acceptor_sites: return []
    if not orflist: return []

    # Filter the flanking splice sites on pssm score once, instead of for
    # every Orf. A threshold of None (documented as allowed) means `no
    # filtering`; comparing a score with None is an error on Python 3.
    # TODO: these are in fact the sites on the normal, large orfs
    # TODO: do we want to check this pssm score?
    if min_donor_pssm_score is None:
        donors = list(preceding_donor_sites)
    else:
        donors = [d for d in preceding_donor_sites
                  if not (d.pssm_score < min_donor_pssm_score)]
    if min_acceptor_pssm_score is None:
        acceptors = list(subsequent_acceptor_sites)
    else:
        acceptors = [a for a in subsequent_acceptor_sites
                     if not (a.pssm_score < min_acceptor_pssm_score)]
    if not donors or not acceptors: return []

    # extremes of the usable splice sites, for a quick per-Orf rejection
    min_preceding_donor_sites_pos = min([d.pos for d in donors])
    max_subsequent_acceptor_sites_pos = max([a.pos for a in acceptors])

    # return list with (intron,tinyexon,intron) tuples
    returnexons = []
    for orfX in orflist:
        # check if orf is correctly positioned towards the splice sites' extremes
        if orfX.endPY <= min_preceding_donor_sites_pos: continue
        if orfX.startPY >= max_subsequent_acceptor_sites_pos: continue

        # if here, we can try to make a bridge by a tinyexon
        for donor in donors:
            # orf not correctly positioned towards the donor site
            if orfX.endPY <= donor.pos: continue

            for acceptor in acceptors:
                # orf not correctly positioned towards the acceptor site
                if orfX.startPY >= acceptor.pos: continue

                # okay, now try to bridge it!
                exons = find_tiny_exon_on_orf(orfX, order_by='total_pssm',
                        max_tinyexon_nt_length=max_tinyexon_nt_length,
                        min_tinyexon_nt_length=min_tinyexon_nt_length,
                        max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                        min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                        min_donor_pssm_score=min_donor_pssm_score,
                        min_acceptor_pssm_score=min_acceptor_pssm_score,
                        min_total_pssm_score=min_total_pssm_score,
                        preceding_donor=donor,
                        subsequent_acceptor=acceptor)

                # and append to returnexons
                for tinyexon in exons:
                    # make preceding intron
                    shared_nts_A = get_shared_nucleotides_at_splicesite(
                            tinyexon.orf, preceding_orf,
                            tinyexon.acceptor, donor)
                    preceding_intron = IntronConnectingOrfs(
                            donor, tinyexon.acceptor,
                            shared_nts_A, preceding_orf, tinyexon.orf)

                    # make subsequent intron
                    shared_nts_B = get_shared_nucleotides_at_splicesite(
                            subsequent_orf, tinyexon.orf,
                            acceptor, tinyexon.donor)
                    subsequent_intron = IntronConnectingOrfs(
                            tinyexon.donor, acceptor,
                            shared_nts_B, tinyexon.orf, subsequent_orf)

                    returnexons.append(
                            (preceding_intron, tinyexon, subsequent_intron))

    # and return the list of intron/exon/intron
    return returnexons