예제 #1
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def find_stopless3n_introns_on_orf(orfObj,
                                   has_branchpoint=False,
                                   has_polypyrimidine=False,
                                   order_by='length',
                                   **kwargs):
    """
    Find potential stopless3n introns on this orf

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons
    stopless3nintrons = merge_orfs_with_intron(orfObj, orfObj, **kwargs)

    # filter for presence of branchpoint / polypyrimidine tracks
    if has_branchpoint or has_polypyrimidine:
        filtered = []
        for intron in stopless3nintrons:
            intron.assign_bp_and_ppts()
            if has_branchpoint and not intron.branchpoint:
                continue
            intron_bp_dist = intron.get_branchpoint_nt_distance()
            if has_branchpoint and intron_bp_dist == None:
                continue
            intron_bp_optimality = min([
                abs(offset - intron_bp_dist)
                for offset in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE
            ])
            if has_branchpoint and intron_bp_optimality > MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                continue
            if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
                continue
            # if here, accepted!
            filtered.append(intron)
    else:
        filtered = stopless3nintrons

    # return ordered intron list
    return _order_intron_list(filtered, order_by=order_by)
예제 #2
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def find_stopless3n_introns_on_orf(orfObj,
    has_branchpoint = False,
    has_polypyrimidine = False,
    order_by = 'length',**kwargs):
    """
    Find potential stopless3n introns on this orf

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons
    stopless3nintrons = merge_orfs_with_intron(orfObj,orfObj,**kwargs)

    # filter for presence of branchpoint / polypyrimidine tracks
    if has_branchpoint or has_polypyrimidine:
        filtered = []
        for intron in stopless3nintrons:
            intron.assign_bp_and_ppts()
            if has_branchpoint and not intron.branchpoint:
                continue
            intron_bp_dist = intron.get_branchpoint_nt_distance()
            if has_branchpoint and intron_bp_dist == None:
                continue
            intron_bp_optimality = min([ abs(offset-intron_bp_dist) for offset in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE ])
            if has_branchpoint and intron_bp_optimality > MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                continue 
            if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
                continue
            # if here, accepted!
            filtered.append( intron )
    else:
        filtered = stopless3nintrons

    # return ordered intron list
    return _order_intron_list(filtered,order_by=order_by)
예제 #3
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def merge_orfs_with_two_tinyexons(preceding_orf,subsequent_orf,
    preceding_donor_sites=[],
    subsequent_acceptor_sites=[],
    orflist=[],**kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexon by applying preceding donors and subsequent acceptors

    @type  preceding_orf: Orf object
	@param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
	@param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
	@param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
	@param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
	@param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs

    @rtype:  list
	@return: list of tuples ( preceding_intron, tinyexon1, central_intron, tinyexon2, subsequent_intron )

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    # return list with (intron,tinyexon,intron) tuples
    returntinyexons = []
    tinyexoncollection = []
    tinyexoncombis = []
    min_preceding_donor_sites_pos     = min([ d.pos for d in preceding_donor_sites ])
    max_subsequent_acceptor_sites_pos = max([ a.pos for a in subsequent_acceptor_sites ]) 

    for orfX in orflist:
        # check if orf is correctly positions towards the splice sites' extremes
        min_pos = min_preceding_donor_sites_pos + kwargs['min_tinyexon_intron_nt_length']
        max_pos = max_subsequent_acceptor_sites_pos - kwargs['min_tinyexon_intron_nt_length']
        # if so, do not check this Orf
        if orfX.endPY   <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        # extend the tinyexoncollection
        tinyexoncollection.extend( get_potential_tiny_exons_on_orf(orfX,**kwargs) )

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented tuples of tinyexons which can co-occur together
    for tinyexon1 in tinyexoncollection:
        for pos in range(len(tinyexoncollection)-1,-1,-1):
            tinyexon2 = tinyexoncollection[pos]
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < kwargs['min_tinyexon_intron_nt_length']: continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, elegiable combi!
            intron = IntronConnectingOrfs(
                    tinyexon1.donor,tinyexon2.acceptor,
                    get_shared_nucleotides_at_splicesite(
                            subsequent_orf,preceding_orf,
                            tinyexon2.acceptor,tinyexon1.donor
                            ),
                    preceding_orf,subsequent_orf)
            totlen = tinyexon1.length+tinyexon2.length
            combi = ( totlen, tinyexon1, intron, tinyexon2 )
            tinyexoncombis.append( combi )

    # return an ordered list based on length
    tinyexoncombis.sort()
    return [ (exon1,intron,exon2) for l,exon1,intron,exon2 in tinyexoncombis ]
예제 #4
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def get_potential_tiny_exons_on_orf(orfX,order_by='length',
    max_tinyexon_nt_length      =TINYEXON_MAX_NT_LENGTH,
    min_tinyexon_nt_length      =TINYEXON_MIN_NT_LENGTH,
    min_donor_pssm_score        =TINYEXON_MIN_DONOR_PSSM_SCORE,
    min_acceptor_pssm_score     =TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
    allow_non_canonical_donor   = TINYEXON_ALLOW_NON_CANONICAL_DONOR,
    allow_non_canonical_acceptor= TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
    non_canonical_min_donor_pssm_score    = TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
    non_canonical_min_acceptor_pssm_score = TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
    min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,**kwargs):
    """
    Predict all possible tiny exons on this Orf

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfX: Orf object
	@param orfX: Orf object to scan for a tinyexon

    @type  max_tinyexon_nt_length: integer
	@param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
	@param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  min_total_pssm_score: float or None
	@param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
	@param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
	@param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @type  order_by: TODO
	@param order_by: TODO

    @rtype:  list
    @return: list with tinyexons
    """

    # scan for splice sites on this (tiny) orf
    forced = orfX._donor_sites == []
    orfX.scan_orf_for_pssm_splice_sites(
            splicetype="donor",forced=forced,
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    forced = orfX._acceptor_sites == []
    orfX.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",forced=forced,
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with exons
    tinyexons = []

    # most quickest scan possible: are there donors & acceptors?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return tinyexons

    # and combine sites to exons!
    for acceptor in orfX._acceptor_sites:
        for donor in orfX._donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon to short
            if exon_length < min_tinyexon_nt_length: continue
            # continue if exon to long
            if exon_length > max_tinyexon_nt_length: continue

            # check sum of donor and acceptor pssm score
            if (min_total_pssm_score or min_total_pssm_score==0.0) and\
            donor.pssm_score + acceptor.pssm_score < min_total_pssm_score:
                continue

            # (re) check individual PSSM scores; in case this Orf had
            # already pre-assigned slice sites, potentially stricter
            # parameters are not applied!
            if donor.is_canonical() and\
            (min_donor_pssm_score or min_donor_pssm_score == 0.0) and\
            donor.pssm_score < min_donor_pssm_score:
                continue
            if not donor.is_canonical() and\
            (non_canonical_min_donor_pssm_score or non_canonical_min_donor_pssm_score == 0.0) and\
            donor.pssm_score < non_canonical_min_donor_pssm_score:
                continue
            if (min_acceptor_pssm_score or min_acceptor_pssm_score == 0.0) and\
            acceptor.pssm_score < min_acceptor_pssm_score:
                continue

            # make a Exon object
            exon = ExonOnOrf(acceptor,donor,orfX)
            tinyexons.append(exon)

    # return ordered exon list
    return _order_intron_list(tinyexons,order_by=order_by)
예제 #5
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def merge_orfs_with_intron(orfD,orfA,
    max_intron_nt_length         = MAX_INTRON_NT_LENGTH,
    min_intron_nt_length         = MIN_INTRON_NT_LENGTH,
    min_donor_pssm_score         = MIN_DONOR_PSSM_SCORE,
    min_acceptor_pssm_score      = MIN_ACCEPTOR_PSSM_SCORE,
    allow_non_canonical_donor    = ALLOW_NON_CANONICAL_DONOR,
    allow_non_canonical_acceptor = ALLOW_NON_CANONICAL_ACCEPTOR,
    non_canonical_min_donor_pssm_score    = NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
    non_canonical_min_acceptor_pssm_score = NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
    min_donor_pos=None,
    max_donor_pos=None,
    min_acceptor_pos=None,
    max_acceptor_pos=None,
    order_by = 'length',**kwargs):
    """
    Merge 2 Orf objects by introns

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron
    
    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with introns
    introns = []

    # most quickest scan possible: are there donors & acceptors?
    if orfD._donor_sites == [] or orfA._acceptor_sites == []:
        # no introns possible because splice sites are missing
        return introns

    # very quick scan: are exons not to far from each other?
    if max_intron_nt_length and\
    (orfA._acceptor_sites[0].pos - orfD._donor_sites[0].pos) > max_intron_nt_length:
        # no introns possible that can bridge this gap
        return introns

    for donor in orfD._donor_sites:
        if not allow_non_canonical_donor and not donor.is_canonical():
            continue
        elif donor.is_canonical() and donor.pssm_score < min_donor_pssm_score:
            continue
        elif not donor.is_canonical() and donor.pssm_score < non_canonical_min_donor_pssm_score:
            continue
        elif (min_donor_pos or min_donor_pos==0) and donor.pos < min_donor_pos:
            continue
        elif (max_donor_pos or max_donor_pos==0) and donor.pos > max_donor_pos:
            continue
        else:
            # donor site accepted
            pass 

        for acceptor in orfA._acceptor_sites:
            if not allow_non_canonical_acceptor and not acceptor.is_canonical():
                continue
            elif acceptor.is_canonical() and acceptor.pssm_score < min_acceptor_pssm_score:
                continue
            elif not acceptor.is_canonical() and acceptor.pssm_score < non_canonical_min_acceptor_pssm_score:
                continue
            elif (min_acceptor_pos or min_acceptor_pos==0) and acceptor.pos < min_acceptor_pos:
                continue
            elif (max_acceptor_pos or max_acceptor_pos==0) and acceptor.pos > max_acceptor_pos:
                continue
            else:
                # acceptor site accepted
                pass 

            # generate intron length and phase variable
            intron_length = acceptor.pos - donor.pos
            intron_phase  = intron_length % 3

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase: continue
            # check phase compatibilty (2) of splice sites
            if ( intron_phase + orfD.frame ) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length: continue
            if min_intron_nt_length and intron_length < min_intron_nt_length: continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                    orfA,orfD,acceptor,donor
                    )

            # make a IntronConnectingOrfs object
            intron = IntronConnectingOrfs(donor,acceptor,shared_nts,orfD,orfA)
            introns.append(intron)

    # return ordered intron list
    return _order_intron_list(introns,order_by=order_by)
예제 #6
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def find_tiny_exon_on_orf(orfX,order_by='total_pssm',
    max_tinyexon_nt_length       =TINYEXON_MAX_NT_LENGTH,
    min_tinyexon_nt_length       =TINYEXON_MIN_NT_LENGTH,
    max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
    min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
    min_donor_pssm_score         =TINYEXON_MIN_DONOR_PSSM_SCORE,
    min_acceptor_pssm_score      =TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
    min_total_pssm_score         =TINYEXON_MIN_TOTAL_PSSM_SCORE,
    allow_non_canonical_donor    = TINYEXON_ALLOW_NON_CANONICAL_DONOR,
    allow_non_canonical_acceptor = TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
    non_canonical_min_donor_pssm_score    = TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
    non_canonical_min_acceptor_pssm_score = TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
    preceding_donor=None,
    subsequent_acceptor=None,
    preceding_donor_pos=None,
    subsequent_acceptor_pos=None):
    """
    Find a tiny exon on an orf by a leading donor and a trailing acceptor site.

    @type  orfX: Orf object
	@param orfX: Orf object to scan for a tinyexon

    @type  preceding_donor: object
	@param preceding_donor: SpliceDonorGT or SpliceDonor object

    @type  subsequent_acceptor: object
	@param subsequent_acceptor: SpliceAcceptorAG or SpliceAcceptor object

    @type  max_tinyexon_nt_length: integer
	@param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
	@param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
	@param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
	@param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
	@param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @type  order_by: TODO
	@param order_by: TODO
    """
    # do some input data processing on preceding_donor
    if preceding_donor == None:
        # preceding donor MUST be set!
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object"
        raise InproperlyAppliedArgument, message
    elif preceding_donor.__class__.__name__ in ['SpliceDonorGT','SpliceDonor']:
        pass
    else:
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object, but a `%s`" % preceding_donor.__class__.__name__
        raise InproperlyAppliedArgument, message

    # do some input data processing on subsequent_acceptor
    if subsequent_acceptor == None:
        # subsequent acceptor MUST be set
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object"
        raise InproperlyAppliedArgument, message
    elif subsequent_acceptor.__class__.__name__ in ['SpliceAcceptorAG','SpliceAcceptor']:
        pass
    else:
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object, but a `%s`" % subsequent_acceptor.__class__.__name__
        raise InproperlyAppliedArgument, message

    # check phases of acceptor and donor
    if subsequent_acceptor.phase not in [0,1,2]:
        raise UnexpectedSpliceSitePhase
    if preceding_donor.phase not in [0,1,2]:
        raise UnexpectedSpliceSitePhase

    # some further integrity check on integer arguments
    for variable in ( max_tinyexon_nt_length, min_tinyexon_nt_length,
    max_tinyexon_intron_nt_length, min_tinyexon_intron_nt_length):
        try:
            variable = int(variable)
            if variable <= 0:
                raise "WUF... WUF..."
        except:
            message = "a variable is NOT a positive integer as expected"
            raise InproperlyAppliedArgument, message

    # scan for splice sites on this (tiny) orf
    orfX.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfX.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with exons
    exons = []

    # most quickest scan possible: are there donors & acceptors?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return exons



    # make a list of compatible_acceptor_sites
    compatible_acceptor_sites = []
    for acceptor in orfX._acceptor_sites:
        # TODO: check! do we need a combi of donor and acceptor or acceptor and acceptor?
        if acceptor.phase != preceding_donor.phase:
            continue
        if acceptor.pssm_score < min_acceptor_pssm_score:
            continue
        if acceptor.pos - preceding_donor.pos < min_tinyexon_intron_nt_length:
            # intron to short
            continue
        if acceptor.pos - preceding_donor.pos > max_tinyexon_intron_nt_length:
            # intron to long
            continue
        # if we reach this point, compatible site!
        compatible_acceptor_sites.append( acceptor )

    # make a list of compatible_donor_sites
    compatible_donor_sites = []
    for donor in orfX._donor_sites:
        # TODO: check! do we need a combi of donor and acceptor or donor and donor?
        if donor.phase != subsequent_acceptor.phase:
            continue
        if donor.pssm_score < min_donor_pssm_score:
            continue
        if subsequent_acceptor.pos - donor.pos > max_tinyexon_intron_nt_length:
            # intron to long
            continue
        if subsequent_acceptor.pos - donor.pos < min_tinyexon_intron_nt_length:
            # intron to short
            continue
        # if we reach this point, compatible site!
        compatible_donor_sites.append( donor )

    # and combine sites to exons!
    for acceptor in compatible_acceptor_sites:
        for donor in compatible_donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon to short
            if exon_length < min_tinyexon_nt_length: continue
            # continue if exon to long
            if exon_length > max_tinyexon_nt_length: continue

            # check sum of donor and acceptor pssm score
            if (min_total_pssm_score or min_total_pssm_score==0.0) and\
            donor.pssm_score + acceptor.pssm_score < min_total_pssm_score:
                continue

            # make a Exon object
            exon = ExonOnOrf(acceptor,donor,orfX)
            exons.append(exon)

    # return ordered exon list
    return _order_intron_list(exons,order_by=order_by)
예제 #7
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def merge_orfs_with_two_tinyexons(preceding_orf,
                                  subsequent_orf,
                                  preceding_donor_sites=[],
                                  subsequent_acceptor_sites=[],
                                  orflist=[],
                                  **kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexon by applying preceding donors and subsequent acceptors

    @type  preceding_orf: Orf object
	@param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
	@param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
	@param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
	@param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
	@param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs

    @rtype:  list
	@return: list of tuples ( preceding_intron, tinyexon1, central_intron, tinyexon2, subsequent_intron )

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # return list with (intron,tinyexon,intron) tuples
    returntinyexons = []
    tinyexoncollection = []
    tinyexoncombis = []
    min_preceding_donor_sites_pos = min([d.pos for d in preceding_donor_sites])
    max_subsequent_acceptor_sites_pos = max(
        [a.pos for a in subsequent_acceptor_sites])

    for orfX in orflist:
        # check if orf is correctly positions towards the splice sites' extremes
        min_pos = min_preceding_donor_sites_pos + kwargs[
            'min_tinyexon_intron_nt_length']
        max_pos = max_subsequent_acceptor_sites_pos - kwargs[
            'min_tinyexon_intron_nt_length']
        # if so, do not check this Orf
        if orfX.endPY <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        # extend the tinyexoncollection
        tinyexoncollection.extend(
            get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
                                            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented tuples of tinyexons which can co-occur together
    for tinyexon1 in tinyexoncollection:
        for pos in range(len(tinyexoncollection) - 1, -1, -1):
            tinyexon2 = tinyexoncollection[pos]
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < kwargs['min_tinyexon_intron_nt_length']:
                continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']:
                continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, elegiable combi!
            intron = IntronConnectingOrfs(
                tinyexon1.donor, tinyexon2.acceptor,
                get_shared_nucleotides_at_splicesite(subsequent_orf,
                                                     preceding_orf,
                                                     tinyexon2.acceptor,
                                                     tinyexon1.donor),
                preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            combi = (totlen, tinyexon1, intron, tinyexon2)
            tinyexoncombis.append(combi)

    # return an ordered list based on length
    tinyexoncombis.sort()
    return [(exon1, intron, exon2)
            for l, exon1, intron, exon2 in tinyexoncombis]
예제 #8
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def get_potential_tiny_exons_on_orf(
        orfX,
        order_by='length',
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=TINYEXON_ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Predict all possible tiny exons on this Orf

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfX: Orf object
	@param orfX: Orf object to scan for a tinyexon

    @type  max_tinyexon_nt_length: integer
	@param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
	@param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  min_total_pssm_score: float or None
	@param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
	@param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
	@param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @type  order_by: TODO
	@param order_by: TODO

    @rtype:  list
    @return: list with tinyexons
    """

    # scan for splice sites on this (tiny) orf
    forced = orfX._donor_sites == []
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        forced=forced,
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    forced = orfX._acceptor_sites == []
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        forced=forced,
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with exons
    tinyexons = []

    # most quickest scan possible: are there donors & acceptors?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return tinyexons

    # and combine sites to exons!
    for acceptor in orfX._acceptor_sites:
        for donor in orfX._donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon to short
            if exon_length < min_tinyexon_nt_length: continue
            # continue if exon to long
            if exon_length > max_tinyexon_nt_length: continue

            # check sum of donor and acceptor pssm score
            if (min_total_pssm_score or min_total_pssm_score==0.0) and\
            donor.pssm_score + acceptor.pssm_score < min_total_pssm_score:
                continue

            # (re) check individual PSSM scores; in case this Orf had
            # already pre-assigned slice sites, potentially stricter
            # parameters are not applied!
            if donor.is_canonical() and\
            (min_donor_pssm_score or min_donor_pssm_score == 0.0) and\
            donor.pssm_score < min_donor_pssm_score:
                continue
            if not donor.is_canonical() and\
            (non_canonical_min_donor_pssm_score or non_canonical_min_donor_pssm_score == 0.0) and\
            donor.pssm_score < non_canonical_min_donor_pssm_score:
                continue
            if (min_acceptor_pssm_score or min_acceptor_pssm_score == 0.0) and\
            acceptor.pssm_score < min_acceptor_pssm_score:
                continue

            # make a Exon object
            exon = ExonOnOrf(acceptor, donor, orfX)
            tinyexons.append(exon)

    # return ordered exon list
    return _order_intron_list(tinyexons, order_by=order_by)
예제 #9
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def merge_orfs_with_intron(
        orfD,
        orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length',
        **kwargs):
    """
    Merge 2 Orf objects by introns

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron
    
    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with introns
    introns = []

    # most quickest scan possible: are there donors & acceptors?
    if orfD._donor_sites == [] or orfA._acceptor_sites == []:
        # no introns possible because splice sites are missing
        return introns

    # very quick scan: are exons not to far from each other?
    if max_intron_nt_length and\
    (orfA._acceptor_sites[0].pos - orfD._donor_sites[0].pos) > max_intron_nt_length:
        # no introns possible that can bridge this gap
        return introns

    for donor in orfD._donor_sites:
        if not allow_non_canonical_donor and not donor.is_canonical():
            continue
        elif donor.is_canonical() and donor.pssm_score < min_donor_pssm_score:
            continue
        elif not donor.is_canonical(
        ) and donor.pssm_score < non_canonical_min_donor_pssm_score:
            continue
        elif (min_donor_pos
              or min_donor_pos == 0) and donor.pos < min_donor_pos:
            continue
        elif (max_donor_pos
              or max_donor_pos == 0) and donor.pos > max_donor_pos:
            continue
        else:
            # donor site accepted
            pass

        for acceptor in orfA._acceptor_sites:
            if not allow_non_canonical_acceptor and not acceptor.is_canonical(
            ):
                continue
            elif acceptor.is_canonical(
            ) and acceptor.pssm_score < min_acceptor_pssm_score:
                continue
            elif not acceptor.is_canonical(
            ) and acceptor.pssm_score < non_canonical_min_acceptor_pssm_score:
                continue
            elif (min_acceptor_pos or min_acceptor_pos
                  == 0) and acceptor.pos < min_acceptor_pos:
                continue
            elif (max_acceptor_pos or max_acceptor_pos
                  == 0) and acceptor.pos > max_acceptor_pos:
                continue
            else:
                # acceptor site accepted
                pass

            # generate intron length and phase variable
            intron_length = acceptor.pos - donor.pos
            intron_phase = intron_length % 3

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase: continue
            # check phase compatibilty (2) of splice sites
            if (intron_phase + orfD.frame) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                orfA, orfD, acceptor, donor)

            # make a IntronConnectingOrfs object
            intron = IntronConnectingOrfs(donor, acceptor, shared_nts, orfD,
                                          orfA)
            introns.append(intron)

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
예제 #10
0
파일: orfs.py 프로젝트: IanReid/ABFGP
def find_tiny_exon_on_orf(
        orfX,
        order_by='total_pssm',
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        allow_non_canonical_donor=TINYEXON_ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        preceding_donor=None,
        subsequent_acceptor=None,
        preceding_donor_pos=None,
        subsequent_acceptor_pos=None):
    """
    Find a tiny exon on an orf by a leading donor and a trailing acceptor site.

    @type  orfX: Orf object
	@param orfX: Orf object to scan for a tinyexon

    @type  preceding_donor: object
	@param preceding_donor: SpliceDonorGT or SpliceDonor object

    @type  subsequent_acceptor: object
	@param subsequent_acceptor: SpliceAcceptorAG or SpliceAcceptor object

    @type  max_tinyexon_nt_length: integer
	@param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
	@param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
	@param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
	@param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
	@param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor 

    @type  order_by: TODO
	@param order_by: TODO
    """
    # do some input data processing on preceding_donor
    if preceding_donor == None:
        # preceding donor MUST be set!
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object"
        raise InproperlyAppliedArgument, message
    elif preceding_donor.__class__.__name__ in [
            'SpliceDonorGT', 'SpliceDonor'
    ]:
        pass
    else:
        message = "preceding_donor is not a `SpliceDonorGT` or `SpliceDonor` object, but a `%s`" % preceding_donor.__class__.__name__
        raise InproperlyAppliedArgument, message

    # do some input data processing on subsequent_acceptor
    if subsequent_acceptor == None:
        # subsequent acceptor MUST be set
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object"
        raise InproperlyAppliedArgument, message
    elif subsequent_acceptor.__class__.__name__ in [
            'SpliceAcceptorAG', 'SpliceAcceptor'
    ]:
        pass
    else:
        message = "subsequent_acceptor is not a `SpliceAcceptorAG` or `SpliceAcceptor` object, but a `%s`" % subsequent_acceptor.__class__.__name__
        raise InproperlyAppliedArgument, message

    # check phases of acceptor and donor
    if subsequent_acceptor.phase not in [0, 1, 2]:
        raise UnexpectedSpliceSitePhase
    if preceding_donor.phase not in [0, 1, 2]:
        raise UnexpectedSpliceSitePhase

    # some further integrity check on integer arguments
    for variable in (max_tinyexon_nt_length, min_tinyexon_nt_length,
                     max_tinyexon_intron_nt_length,
                     min_tinyexon_intron_nt_length):
        try:
            variable = int(variable)
            if variable <= 0:
                raise "WUF... WUF..."
        except:
            message = "a variable is NOT a positive integer as expected"
            raise InproperlyAppliedArgument, message

    # scan for splice sites on this (tiny) orf
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfX.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with exons
    exons = []

    # most quickest scan possible: are there donors & acceptors?
    if orfX._donor_sites == [] or orfX._acceptor_sites == []:
        # no exons possible because splice sites are missing
        return exons

    # make a list of compatible_acceptor_sites
    compatible_acceptor_sites = []
    for acceptor in orfX._acceptor_sites:
        # TODO: check! do we need a combi of donor and acceptor or acceptor and acceptor?
        if acceptor.phase != preceding_donor.phase:
            continue
        if acceptor.pssm_score < min_acceptor_pssm_score:
            continue
        if acceptor.pos - preceding_donor.pos < min_tinyexon_intron_nt_length:
            # intron to short
            continue
        if acceptor.pos - preceding_donor.pos > max_tinyexon_intron_nt_length:
            # intron to long
            continue
        # if we reach this point, compatible site!
        compatible_acceptor_sites.append(acceptor)

    # make a list of compatible_donor_sites
    compatible_donor_sites = []
    for donor in orfX._donor_sites:
        # TODO: check! do we need a combi of donor and acceptor or donor and donor?
        if donor.phase != subsequent_acceptor.phase:
            continue
        if donor.pssm_score < min_donor_pssm_score:
            continue
        if subsequent_acceptor.pos - donor.pos > max_tinyexon_intron_nt_length:
            # intron to long
            continue
        if subsequent_acceptor.pos - donor.pos < min_tinyexon_intron_nt_length:
            # intron to short
            continue
        # if we reach this point, compatible site!
        compatible_donor_sites.append(donor)

    # and combine sites to exons!
    for acceptor in compatible_acceptor_sites:
        for donor in compatible_donor_sites:
            # length of exon
            exon_length = donor.pos - acceptor.pos
            # continue if exon to short
            if exon_length < min_tinyexon_nt_length: continue
            # continue if exon to long
            if exon_length > max_tinyexon_nt_length: continue

            # check sum of donor and acceptor pssm score
            if (min_total_pssm_score or min_total_pssm_score==0.0) and\
            donor.pssm_score + acceptor.pssm_score < min_total_pssm_score:
                continue

            # make a Exon object
            exon = ExonOnOrf(acceptor, donor, orfX)
            exons.append(exon)

    # return ordered exon list
    return _order_intron_list(exons, order_by=order_by)