def bridge_two_pacbporfs_by_tinyexon(preceding_orf, subsequent_orf,
        preceding_donor_sites=[],
        subsequent_acceptor_sites=[],
        orflist=[],
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE):
    """
    Bridge two `neighbouring` Orfs by a tinyexon by applying preceding donors and subsequent acceptors

    Every Orf in orflist is tried as the carrier of a tinyexon between each
    (preceding donor, subsequent acceptor) pair that it is positioned
    between; the actual search is delegated to find_tiny_exon_on_orf().

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon;
                                 also applied to the preceding donor sites,
                                 None disables that check

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon;
                                 also applied to the subsequent acceptor
                                 sites, None disables that check

    @rtype:  list
    @return: [] when any of the three input lists is empty; otherwise the
             result of _order_intron_list() applied to the unique tinyexon
             objects found, one per ( acceptor.pos, donor.pos ) pair.
             NOTE(review): an earlier design returned tuples of
             ( preceding_intron, tinyexon, subsequent_intron ); the introns
             are still constructed but are currently NOT returned.

    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE
    """
    # The list defaults in the signature are kept for interface stability;
    # they are only ever read in this function, never mutated.
    if (not preceding_donor_sites or not subsequent_acceptor_sites
            or not orflist):
        return []

    # unique tinyexons, keyed by their ( acceptor.pos, donor.pos ) coordinates
    returnexons = {}

    # outermost splice site positions; used as a cheap per-orf pre-filter
    min_preceding_donor_sites_pos = min(
        d.pos for d in preceding_donor_sites)
    max_subsequent_acceptor_sites_pos = max(
        a.pos for a in subsequent_acceptor_sites)

    for orfX in orflist:
        # check if orf is correctly positioned towards the splice sites' extremes
        if orfX.endPY <= min_preceding_donor_sites_pos:
            continue
        if orfX.startPY >= max_subsequent_acceptor_sites_pos:
            continue

        # if here, we can try to make a bridge by a tinyexon
        for donor in preceding_donor_sites:
            # orf not correctly positioned towards the donor site
            if orfX.endPY <= donor.pos:
                continue
            # check pssm_score of donor site
            # TODO: this is in fact the donor on the normal, large orf
            # TODO: do we want to check this pssm score?
            # A threshold of None means `no threshold`; spelled out
            # explicitly instead of relying on implicit None ordering.
            if (min_donor_pssm_score is not None
                    and donor.pssm_score < min_donor_pssm_score):
                continue

            for acceptor in subsequent_acceptor_sites:
                # orf not correctly positioned towards the acceptor site
                if orfX.startPY >= acceptor.pos:
                    continue
                # check pssm_score of acceptor site
                # TODO: this is in fact the acceptor on the normal, large orf
                # TODO: do we want to check this pssm score?
                if (min_acceptor_pssm_score is not None
                        and acceptor.pssm_score < min_acceptor_pssm_score):
                    continue

                # okay, now try to bridge it!
                exons = find_tiny_exon_on_orf(
                    orfX, order_by='total_pssm',
                    max_tinyexon_nt_length=max_tinyexon_nt_length,
                    min_tinyexon_nt_length=min_tinyexon_nt_length,
                    max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                    min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                    min_donor_pssm_score=min_donor_pssm_score,
                    min_acceptor_pssm_score=min_acceptor_pssm_score,
                    min_total_pssm_score=min_total_pssm_score,
                    preceding_donor=donor,
                    subsequent_acceptor=acceptor,
                    )

                for tinyexon in exons:
                    # NOTE(review): both introns are constructed but not
                    # stored or returned. Construction is kept because the
                    # constructor may validate the splice site combination;
                    # confirm whether these objects are still needed.
                    # make preceding intron
                    shared_nts_A = "TODO"
                    preceding_intron = IntronConnectingOrfs(
                        donor, tinyexon.acceptor,
                        shared_nts_A, preceding_orf, tinyexon.orf)

                    # make subsequent intron
                    shared_nts_B = "TODO"
                    subsequent_intron = IntronConnectingOrfs(
                        tinyexon.donor, acceptor,
                        shared_nts_B, tinyexon.orf, subsequent_orf)

                    # store only the first tinyexon seen per coordinate pair;
                    # direct dict membership avoids building a key list
                    # on every iteration
                    key = (tinyexon.acceptor.pos, tinyexon.donor.pos)
                    if key not in returnexons:
                        returnexons[key] = tinyexon

    # and return the ordered list of unique tinyexons
    return _order_intron_list(list(returnexons.values()))
def scan_orf_for_tiny_exon(orfX, order_by='total_pssm',
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        allow_non_canonical_donor=TINYEXON_ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
        min_intron_nt_length=None,
        max_intron_nt_length=None,
        donor_phase=None,
        acceptor_phase=None,
        preceeding_donor_site=None,
        subsequent_acceptor_site=None,
        min_acceptor_pos=None,
        max_donor_pos=None):
    """
    Find tiny exons on an orf by length range

    Splice sites are (re)scanned on orfX, filtered on the constraints given,
    and every remaining acceptor/donor pair whose distance falls within the
    tinyexon length range is turned into an ExonOnOrf object.

    @type  orfX: Orf object
    @param orfX: Orf object to scan for a tinyexon

    @type  order_by: TODO
    @param order_by: passed unchanged to _order_intron_list()

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  allow_non_canonical_donor: boolean
    @param allow_non_canonical_donor: passed as `allow_non_canonical` to the
                                      donor splice site scan on orfX

    @type  allow_non_canonical_acceptor: boolean
    @param allow_non_canonical_acceptor: passed as `allow_non_canonical` to
                                      the acceptor splice site scan on orfX

    @type  min_intron_nt_length: integer or None
    @param min_intron_nt_length: smallest distance between a tinyexon site and
                                 preceeding_donor_site / subsequent_acceptor_site;
                                 only applied when that site is given

    @type  max_intron_nt_length: integer or None
    @param max_intron_nt_length: largest distance between a tinyexon site and
                                 preceeding_donor_site / subsequent_acceptor_site;
                                 only applied when that site is given

    @type  donor_phase: integer or None
    @param donor_phase: when 0, 1 or 2, only donors of this phase are used

    @type  acceptor_phase: integer or None
    @param acceptor_phase: when 0, 1 or 2, only acceptors of this phase are used

    @type  preceeding_donor_site: object or None
    @param preceeding_donor_site: when given, acceptors must have the same
                                  phase as this site

    @type  subsequent_acceptor_site: object or None
    @param subsequent_acceptor_site: when given, donors must have the same
                                  phase as this site

    @type  max_donor_pos: integer or None
    @param max_donor_pos: maximal eligible donor position

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: minimal eligible acceptor position

    @rtype:  list
    @return: [] when orfX has no donor or no acceptor sites; otherwise the
             result of _order_intron_list( exons, order_by=order_by )
    """
    # scan for splice sites on this (tiny) orf
    # The allow_non_canonical_* arguments are honoured here; previously the
    # module-level constants were passed and the arguments had no effect.
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE)
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE)

    # return list with exons
    exons = []

    # quickest check possible: are there donors & acceptors at all?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return exons

    # NOTE(review): the position and intron length limits below are tested
    # on truthiness, so a value of 0 is treated the same as None (no limit).

    # make a list of compatible_acceptor_sites
    compatible_acceptor_sites = []
    for acceptor in orfX._acceptor_sites:
        if acceptor_phase in (0, 1, 2) and acceptor.phase != acceptor_phase:
            continue
        # a threshold of None means `no threshold`
        if (min_acceptor_pssm_score is not None
                and acceptor.pssm_score < min_acceptor_pssm_score):
            continue
        if min_acceptor_pos and acceptor.pos < min_acceptor_pos:
            continue
        if preceeding_donor_site:
            if preceeding_donor_site.phase != acceptor.phase:
                continue
            # distance from the preceding donor to this acceptor
            intron_length = acceptor.pos - preceeding_donor_site.pos
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
        # if we reach this point, compatible site!
        compatible_acceptor_sites.append(acceptor)

    # make a list of compatible_donor_sites
    compatible_donor_sites = []
    for donor in orfX._donor_sites:
        if donor_phase in (0, 1, 2) and donor.phase != donor_phase:
            continue
        if (min_donor_pssm_score is not None
                and donor.pssm_score < min_donor_pssm_score):
            continue
        if max_donor_pos and donor.pos > max_donor_pos:
            continue
        if subsequent_acceptor_site:
            if subsequent_acceptor_site.phase != donor.phase:
                continue
            # distance from this donor to the subsequent acceptor
            intron_length = subsequent_acceptor_site.pos - donor.pos
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
        # if we reach this point, compatible site!
        compatible_donor_sites.append(donor)

    # and combine sites to exons!
    for acceptor in compatible_acceptor_sites:
        for donor in compatible_donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon too short
            if exon_length < min_tinyexon_nt_length:
                continue
            # continue if exon too long
            if exon_length > max_tinyexon_nt_length:
                continue
            # check sum of donor and acceptor pssm score (0.0 is a valid
            # threshold, only None disables the check)
            if (min_total_pssm_score is not None
                    and donor.pssm_score + acceptor.pssm_score
                    < min_total_pssm_score):
                continue
            # make an Exon object
            exons.append(ExonOnOrf(acceptor, donor, orfX))

    # return ordered exon list
    return _order_intron_list(exons, order_by=order_by)
def find_tiny_exon_on_orf(orfX, order_by='total_pssm',
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        preceding_donor=None,
        subsequent_acceptor=None,
        preceding_donor_pos=None,
        subsequent_acceptor_pos=None):
    """
    Find a tiny exon on an orf by a leading donor and a trailing acceptor site.

    @type  orfX: Orf object
    @param orfX: Orf object to scan for a tinyexon

    @type  preceding_donor: object
    @param preceding_donor: SpliceDonorGT or SpliceDonor object (required)

    @type  subsequent_acceptor: object
    @param subsequent_acceptor: SpliceAcceptorAG or SpliceAcceptor object (required)

    @type  preceding_donor_pos: None
    @param preceding_donor_pos: accepted but not used by this function

    @type  subsequent_acceptor_pos: None
    @param subsequent_acceptor_pos: accepted but not used by this function

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  order_by: TODO
    @param order_by: passed unchanged to _order_intron_list()

    @rtype:  list
    @return: [] when orfX has no donor or no acceptor sites; otherwise the
             result of _order_intron_list( exons, order_by=order_by )

    @raise InproperlyAppliedArgument: preceding_donor or subsequent_acceptor
             is missing or of an unexpected class, or one of the four
             length arguments is not a positive integer
    @raise UnexpectedSpliceSitePhase: phase of preceding_donor or
             subsequent_acceptor is not 0, 1 or 2

    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE
    """
    # scan for splice sites on this (tiny) orf
    # NOTE(review): the scan uses the module-level minimal pssm scores, not
    # the min_donor_pssm_score / min_acceptor_pssm_score arguments (those
    # are only applied as a filter further down) - confirm this is intended.
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        allow_non_canonical=TINYEXON_ALLOW_NON_CANONICAL_DONOR,
        non_canonical_min_pssm_score=TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE)
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical=TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_pssm_score=TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE)

    # return list with exons
    exons = []

    # do some input data processing on preceding_donor
    # (identity test: `== None` could trigger a custom comparison method)
    if preceding_donor is None:
        # preceding donor MUST be set!
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object"
        raise InproperlyAppliedArgument(message)
    if preceding_donor.__class__.__name__ not in ('SpliceDonorGT', 'SpliceDonor'):
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object, but a `%s`" % preceding_donor.__class__.__name__
        raise InproperlyAppliedArgument(message)

    # do some input data processing on subsequent_acceptor
    if subsequent_acceptor is None:
        # subsequent acceptor MUST be set
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object"
        raise InproperlyAppliedArgument(message)
    if subsequent_acceptor.__class__.__name__ not in ('SpliceAcceptorAG', 'SpliceAcceptor'):
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object, but a `%s`" % subsequent_acceptor.__class__.__name__
        raise InproperlyAppliedArgument(message)

    # check phases of acceptor and donor
    if subsequent_acceptor.phase not in (0, 1, 2):
        raise UnexpectedSpliceSitePhase
    if preceding_donor.phase not in (0, 1, 2):
        raise UnexpectedSpliceSitePhase

    # some further integrity check on integer arguments
    for value in (max_tinyexon_nt_length, min_tinyexon_nt_length,
                  max_tinyexon_intron_nt_length,
                  min_tinyexon_intron_nt_length):
        try:
            is_positive = int(value) > 0
        except (TypeError, ValueError, OverflowError, AttributeError):
            # not convertible to an integer at all (AttributeError covers
            # old-style instances without __int__)
            is_positive = False
        if not is_positive:
            message = "a variable is NOT a positive integer as expected"
            raise InproperlyAppliedArgument(message)

    # quickest check possible: are there donors & acceptors at all?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return exons

    # make a list of compatible_acceptor_sites
    compatible_acceptor_sites = []
    for acceptor in orfX._acceptor_sites:
        # TODO: check! do we need a combi of donor and acceptor or acceptor and acceptor?
        if acceptor.phase != preceding_donor.phase:
            continue
        # a threshold of None means `no threshold`
        if (min_acceptor_pssm_score is not None
                and acceptor.pssm_score < min_acceptor_pssm_score):
            continue
        # distance from the preceding donor to this acceptor
        intron_length = acceptor.pos - preceding_donor.pos
        if intron_length < min_tinyexon_intron_nt_length:
            # intron too short
            continue
        if intron_length > max_tinyexon_intron_nt_length:
            # intron too long
            continue
        # if we reach this point, compatible site!
        compatible_acceptor_sites.append(acceptor)

    # make a list of compatible_donor_sites
    compatible_donor_sites = []
    for donor in orfX._donor_sites:
        # TODO: check! do we need a combi of donor and acceptor or donor and donor?
        if donor.phase != subsequent_acceptor.phase:
            continue
        if (min_donor_pssm_score is not None
                and donor.pssm_score < min_donor_pssm_score):
            continue
        # distance from this donor to the subsequent acceptor
        intron_length = subsequent_acceptor.pos - donor.pos
        if intron_length > max_tinyexon_intron_nt_length:
            # intron too long
            continue
        if intron_length < min_tinyexon_intron_nt_length:
            # intron too short
            continue
        # if we reach this point, compatible site!
        compatible_donor_sites.append(donor)

    # and combine sites to exons!
    for acceptor in compatible_acceptor_sites:
        for donor in compatible_donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon too short
            if exon_length < min_tinyexon_nt_length:
                continue
            # continue if exon too long
            if exon_length > max_tinyexon_nt_length:
                continue
            # check sum of donor and acceptor pssm score (0.0 is a valid
            # threshold, only None disables the check)
            if (min_total_pssm_score is not None
                    and donor.pssm_score + acceptor.pssm_score
                    < min_total_pssm_score):
                continue
            # make an Exon object
            exons.append(ExonOnOrf(acceptor, donor, orfX))

    # return ordered exon list
    return _order_intron_list(exons, order_by=order_by)