Exemplo n.º 1
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,
                                                pacbporfA,
                                                verbose=False,
                                                **kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,
                                        pacbporfA.orfQ,
                                        min_donor_pos=min_donor_query_pos,
                                        max_acceptor_pos=max_accep_query_pos,
                                        **kwargs)

    # filter on entropy
    # settings for minimal alignment entropy score
    if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55:
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(
            intronlist,
            pacbporfD,
            pacbporfA,
            min_donor_site_entropy=min_donor_site_entropy,
            min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            succes = set_apps_intron_query(intron, pacbporfD, pacbporfA)

    for intron in intronlist:
        intron._distance = 0  # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append(intron)
        else:
            pass

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append(intron)

    # return list of filtered introns
    return filtered_intron_list
Exemplo n.º 2
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
            min_donor_pos   =min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,**kwargs)


    # filter on entropy
    # settings for minimal alignment entropy score
    if min([pacbporfD.identityscore,pacbporfA.identityscore]) > 0.55:
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(intronlist,pacbporfD,pacbporfA,
                min_donor_site_entropy=min_donor_site_entropy,
                min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            succes = set_apps_intron_query(intron,pacbporfD,pacbporfA)


    for intron in intronlist:
        intron._distance = 0 # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    # get unique list of donors & acceptors
    donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
    accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')

    ############################################################################
    if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep]
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    # get unique list of donors & acceptors
    donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
    accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')

    ############################################################################
    if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep]
    ############################################################################

    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append( intron )
        else:
            pass

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append( intron )

    # return list of filtered introns
    return filtered_intron_list
Exemplo n.º 3
0
def merge_pacbporfs_by_tinyexons(pacbporfD,pacbporfA,
    orfSetObjQ,orfSetObjS,verbose=False,**kwargs):
    """ """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    resultlistQ = merge_orfs_with_tinyexon(
            pacbporfD.orfQ,pacbporfA.orfQ,
            preceding_donor_sites=pacbporfD.orfQ._donor_sites,
            subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites,
            orflist=orfSetObjQ.orfs,**kwargs)
    resultlistS = merge_orfs_with_tinyexon(
            pacbporfD.orfS,pacbporfA.orfS,
            preceding_donor_sites=pacbporfD.orfS._donor_sites,
            subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites,
            orflist=orfSetObjS.orfs,**kwargs)

    # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ]
    resultdictQ,key2exonQ = _tinyexon_list_2_dict(resultlistQ)
    resultdictS,key2exonS = _tinyexon_list_2_dict(resultlistS)

    # get unique list of donors & acceptors
    donorQ = olba( list(Set([inD.donor for inD,te,inA in resultlistQ ])), order_by='pos')
    donorS = olba( list(Set([inD.donor for inD,te,inA in resultlistS ])), order_by='pos')
    accepQ = olba( list(Set([inA.acceptor for inD,te,inA in resultlistQ ])), order_by='pos')
    accepS = olba( list(Set([inA.acceptor for inD,te,inA in resultlistS ])), order_by='pos')

    ## filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical']               = True # True
    kwargs['aligned_site_max_triplet_distance'] = 0     # 2
    algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs)
    algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs)

    # settings for minimal alignment entropy score
    # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!!
    min_donor_site_alignment_entropy = 0.1
    min_acceptor_site_alignment_entropy = 0.1


    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor',
                min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor',
                min_alignment_entropy=min_acceptor_site_alignment_entropy)

    # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS
    return_list = []

    ############################################################################
    if verbose:
        print "bridges constructed: ORFS:",
        print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
        print (pacbporfD.orfS.id,pacbporfA.orfS.id),
        print len(resultdictQ), len(resultdictS),
        print ( len(resultlistQ), len(donorQ), len(accepQ) ),
        print ( len(resultlistS), len(donorS), len(accepS) ),
        print ( len(algdonors), len(algacceps) )
    ############################################################################

    for keyQ,tinyexonQ in key2exonQ.iteritems():
        for keyS,tinyexonS in key2exonS.iteritems():
            if tinyexonQ.donor.phase != tinyexonS.donor.phase:
                continue
            if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase:
                continue
            if tinyexonQ.length != tinyexonS.length:
                continue
            # if here, then tinyexons of identical structure


            ####################################################################
            if verbose:
                print tinyexonQ.length, tinyexonQ.donor.phase,
                print ( len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1]) ),
                print ( len(resultdictS[keyS][0]), len(resultdictS[keyS][1]) ),
                print tinyexonQ,
                print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(),
                print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score
            ####################################################################

            donor_introns = []
            acceptor_introns = []
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = ( intronDQ.donor.pos, intronDS.donor.pos )
                    if alignedkey not in [ (dQ.pos, dS.pos) for dQ,dS in algdonors ]:
                        continue
                    # if here, we have a set of introns 5' of the tinyexon
                    # which are perfectly alignable!
                    donor_introns.append((intronDQ,intronDS))

            for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]:
                    continue
                for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                    if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = ( intronAQ.acceptor.pos, intronAS.acceptor.pos )
                    if alignedkey not in [ (aQ.pos, aS.pos) for aQ,aS in algacceps ]:
                        continue
                    # if here, we have a set of introns 3' of the tinyexon
                    # which are perfectly alignable!
                    acceptor_introns.append((intronAQ,intronAS))

            if not len(donor_introns) or not len(acceptor_introns):
                # no aligned 5' && aligned 3' introns
                continue

            # initialize extended tinyexon PacbPORF
            from pacb import PacbP
            pacbp = PacbP(input=( 
                    tinyexonQ.proteinsequence(),
                    tinyexonS.proteinsequence(),
                    tinyexonQ.protein_start(),
                    tinyexonS.protein_start(),
                    ) )
            pacbp.strip_unmatched_ends()
            # continue if no fraction could be aligned
            if len(pacbp) == 0: continue
            tinypacbporf = pacbp2pacbporf(pacbp,tinyexonQ.orf,tinyexonS.orf)
            tinypacbporf.extend_pacbporf_after_stops()

            ####################################################################
            if verbose:
                print tinypacbporf
                tinypacbporf.print_protein_and_dna()
                print len(donor_introns), len(acceptor_introns),
                print max([ dQ.donor.pssm_score+dS.donor.pssm_score for dQ,dS in donor_introns]),
                print max([ aQ.acceptor.pssm_score+aS.acceptor.pssm_score for aQ,aS in acceptor_introns])
            ####################################################################


            # if here, we have accepted tinyexon bridges!
            # gather them and store to return_list
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]:
                        continue
                    for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                        if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]:
                            continue
                        for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                            if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]:
                                continue
                            ####################################################
                            # set some meta-data properties to the intron objects
                            ####################################################
                            _score_introns_obtained_by_mapping(
                                    intronDQ,intronDS,pacbporfD,
                                    tinypacbporf,source='ABGPmappingTE')
                            _score_introns_obtained_by_mapping(
                                    intronAQ,intronAS,tinypacbporf,
                                    pacbporfA,source='ABGPmappingTE')
                            # create _linked_to_xxx attributes
                            intronDQ._linked_to_pacbporfs = [ tinypacbporf ]
                            intronAQ._linked_to_pacbporfs = [ tinypacbporf ]
                            intronDS._linked_to_pacbporfs = [ tinypacbporf ]
                            intronAS._linked_to_pacbporfs = [ tinypacbporf ]
                            intronDQ._linked_to_introns   = [ intronAQ ]
                            intronAQ._linked_to_introns   = [ intronDQ ]
                            intronDS._linked_to_introns   = [ intronAS ]
                            intronAS._linked_to_introns   = [ intronDS ]
                            # append to tmp result list
                            return_list.append(
                                (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS)
                                )

    # check if there are >1 candidate tiny exons
    # currently, we choose only to return the **best** mapped tinyexon 
    if len(return_list) == 0:
        pass
    elif len(return_list) == 1:
        pass
    else:
        # only take the highest scoring candidate here 
        min_distance = min([ (a._distance+d._distance) for a,b,c,d,e in return_list ])
        pos2score = []
        for (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) in return_list:
            if (intronDQ._distance + intronAQ._distance) > min_distance:
                pos2score.append( 0.0 )
            else:
                # calculate overall pssm score
                total_pssm = 0.0
                total_pssm += intronDQ.donor.pssm_score
                total_pssm += intronDQ.acceptor.pssm_score
                total_pssm += intronDS.donor.pssm_score
                total_pssm += intronDS.acceptor.pssm_score
                total_pssm += intronAQ.donor.pssm_score
                total_pssm += intronAQ.acceptor.pssm_score
                total_pssm += intronAS.donor.pssm_score
                total_pssm += intronAS.acceptor.pssm_score
                pos2score.append( total_pssm )
        # get highest score and linked tinyexon
        max_score = max(pos2score)
        return_list = [ return_list[pos2score.index(max_score)] ]

    ############################################################################
    # some printing in verbose mode
    if verbose and return_list:
        (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) = return_list[0]
        print "BEST MAPPED TINYEXON:"
        print tinypacbporf
        print tinypacbporf.query, intronDQ._distance, intronAQ._distance,
        print ( intronDQ.donor.pos, intronDQ.acceptor.pos ),
        print ( intronDS.donor.pos, intronDS.acceptor.pos ),
        print ( intronAQ.donor.pos, intronAQ.acceptor.pos ),
        print ( intronAS.donor.pos, intronAS.acceptor.pos )
    ############################################################################

    # return the result list
    return return_list
Exemplo n.º 4
0
def merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_sbjct_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,**kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,**kwargs)

    # get unique list of donors & acceptors
    donorQ = olba( list(Set([inQ.donor for inQ in intronsQ ])), order_by='pos')
    donorS = olba( list(Set([inS.donor for inS in intronsS ])), order_by='pos')
    accepQ = olba( list(Set([inQ.acceptor for inQ in intronsQ ])), order_by='pos')
    accepS = olba( list(Set([inS.acceptor for inS in intronsS ])), order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [ d.pos for d in donorQ ], "aQ1", [ a.pos for a in accepQ ]
        print "dS1", [ d.pos for d in donorS ], "aS1", [ a.pos for a in accepS ]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ2", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS2", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS2", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor',
                min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor',
                min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ3", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS3", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS3", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################


    # make unique position lists for quick lookup in intron lists
    dQpl = Set([ dQ.pos for dQ,dS in algdonors ])
    dSpl = Set([ dS.pos for dQ,dS in algdonors ])
    aQpl = Set([ aQ.pos for aQ,aS in algacceps ])
    aSpl = Set([ aS.pos for aQ,aS in algacceps ])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append( ( intQ, intS ) )

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ,intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
                        query = intQ.donor.pos, sbjct = intS.donor.pos
                        )
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
                        query = intQ.acceptor.pos, sbjct = intS.acceptor.pos
                        )

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance']*3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ,pacbporfD,pacbporfA)
        succes = set_apps_intron_sbjct(intS,pacbporfD,pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", ( intQ.donor.pos, intQ.acceptor.pos ) ,
            print ( intS.donor.pos, intS.acceptor.pos ),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score, intS.donor.pssm_score,
                intQ.acceptor.pssm_score, intS.acceptor.pssm_score,
                )
        ########################################################################

    # return lists of aligned introns
    return algintrons
Exemplo n.º 5
0
def merge_pacbporfs_by_tinyexons(pacbporfD,
                                 pacbporfA,
                                 orfSetObjQ,
                                 orfSetObjS,
                                 verbose=False,
                                 **kwargs):
    """ """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    resultlistQ = merge_orfs_with_tinyexon(
        pacbporfD.orfQ,
        pacbporfA.orfQ,
        preceding_donor_sites=pacbporfD.orfQ._donor_sites,
        subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites,
        orflist=orfSetObjQ.orfs,
        **kwargs)
    resultlistS = merge_orfs_with_tinyexon(
        pacbporfD.orfS,
        pacbporfA.orfS,
        preceding_donor_sites=pacbporfD.orfS._donor_sites,
        subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites,
        orflist=orfSetObjS.orfs,
        **kwargs)

    # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ]
    resultdictQ, key2exonQ = _tinyexon_list_2_dict(resultlistQ)
    resultdictS, key2exonS = _tinyexon_list_2_dict(resultlistS)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inD.donor for inD, te, inA in resultlistQ])),
                  order_by='pos')
    donorS = olba(list(Set([inD.donor for inD, te, inA in resultlistS])),
                  order_by='pos')
    accepQ = olba(list(Set([inA.acceptor for inD, te, inA in resultlistQ])),
                  order_by='pos')
    accepS = olba(list(Set([inA.acceptor for inD, te, inA in resultlistS])),
                  order_by='pos')

    ## filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = True  # True
    kwargs['aligned_site_max_triplet_distance'] = 0  # 2
    algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD,
                                                   **kwargs)
    algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA,
                                                   **kwargs)

    # settings for minimal alignment entropy score
    # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!!
    min_donor_site_alignment_entropy = 0.1
    min_acceptor_site_alignment_entropy = 0.1

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(
        algdonors,
        pacbporfD,
        'donor',
        min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(
        algacceps,
        pacbporfA,
        'acceptor',
        min_alignment_entropy=min_acceptor_site_alignment_entropy)

    # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS
    return_list = []

    ############################################################################
    if verbose:
        print "bridges constructed: ORFS:",
        print(pacbporfD.orfQ.id, pacbporfA.orfQ.id),
        print(pacbporfD.orfS.id, pacbporfA.orfS.id),
        print len(resultdictQ), len(resultdictS),
        print(len(resultlistQ), len(donorQ), len(accepQ)),
        print(len(resultlistS), len(donorS), len(accepS)),
        print(len(algdonors), len(algacceps))
    ############################################################################

    for keyQ, tinyexonQ in key2exonQ.iteritems():
        for keyS, tinyexonS in key2exonS.iteritems():
            if tinyexonQ.donor.phase != tinyexonS.donor.phase:
                continue
            if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase:
                continue
            if tinyexonQ.length != tinyexonS.length:
                continue
            # if here, then tinyexons of identical structure

            ####################################################################
            if verbose:
                print tinyexonQ.length, tinyexonQ.donor.phase,
                print(len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1])),
                print(len(resultdictS[keyS][0]), len(resultdictS[keyS][1])),
                print tinyexonQ,
                print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(),
                print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score
            ####################################################################

            donor_introns = []
            acceptor_introns = []
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [
                            dS.pos for dQ, dS in algdonors
                    ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = (intronDQ.donor.pos, intronDS.donor.pos)
                    if alignedkey not in [(dQ.pos, dS.pos)
                                          for dQ, dS in algdonors]:
                        continue
                    # if here, we have a set of introns 5' of the tinyexon
                    # which are perfectly alignable!
                    donor_introns.append((intronDQ, intronDS))

            for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                if intronAQ.acceptor.pos not in [
                        aQ.pos for aQ, aS in algacceps
                ]:
                    continue
                for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                    if intronAS.acceptor.pos not in [
                            aS.pos for aQ, aS in algacceps
                    ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = (intronAQ.acceptor.pos, intronAS.acceptor.pos)
                    if alignedkey not in [(aQ.pos, aS.pos)
                                          for aQ, aS in algacceps]:
                        continue
                    # if here, we have a set of introns 3' of the tinyexon
                    # which are perfectly alignable!
                    acceptor_introns.append((intronAQ, intronAS))

            if not len(donor_introns) or not len(acceptor_introns):
                # no aligned 5' && aligned 3' introns
                continue

            # initialize extended tinyexon PacbPORF
            from pacb import PacbP
            pacbp = PacbP(input=(
                tinyexonQ.proteinsequence(),
                tinyexonS.proteinsequence(),
                tinyexonQ.protein_start(),
                tinyexonS.protein_start(),
            ))
            pacbp.strip_unmatched_ends()
            # continue if no fraction could be aligned
            if len(pacbp) == 0: continue
            tinypacbporf = pacbp2pacbporf(pacbp, tinyexonQ.orf, tinyexonS.orf)
            tinypacbporf.extend_pacbporf_after_stops()

            ####################################################################
            if verbose:
                print tinypacbporf
                tinypacbporf.print_protein_and_dna()
                print len(donor_introns), len(acceptor_introns),
                print max([
                    dQ.donor.pssm_score + dS.donor.pssm_score
                    for dQ, dS in donor_introns
                ]),
                print max([
                    aQ.acceptor.pssm_score + aS.acceptor.pssm_score
                    for aQ, aS in acceptor_introns
                ])
            ####################################################################

            # if here, we have accepted tinyexon bridges!
            # gather them and store to return_list
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [
                            dS.pos for dQ, dS in algdonors
                    ]:
                        continue
                    for intronAQkey, intronAQ in resultdictQ[keyQ][
                            1].iteritems():
                        if intronAQ.acceptor.pos not in [
                                aQ.pos for aQ, aS in algacceps
                        ]:
                            continue
                        for intronASkey, intronAS in resultdictS[keyS][
                                1].iteritems():
                            if intronAS.acceptor.pos not in [
                                    aS.pos for aQ, aS in algacceps
                            ]:
                                continue
                            ####################################################
                            # set some meta-data properties to the intron objects
                            ####################################################
                            _score_introns_obtained_by_mapping(
                                intronDQ,
                                intronDS,
                                pacbporfD,
                                tinypacbporf,
                                source='ABGPmappingTE')
                            _score_introns_obtained_by_mapping(
                                intronAQ,
                                intronAS,
                                tinypacbporf,
                                pacbporfA,
                                source='ABGPmappingTE')
                            # create _linked_to_xxx attributes
                            intronDQ._linked_to_pacbporfs = [tinypacbporf]
                            intronAQ._linked_to_pacbporfs = [tinypacbporf]
                            intronDS._linked_to_pacbporfs = [tinypacbporf]
                            intronAS._linked_to_pacbporfs = [tinypacbporf]
                            intronDQ._linked_to_introns = [intronAQ]
                            intronAQ._linked_to_introns = [intronDQ]
                            intronDS._linked_to_introns = [intronAS]
                            intronAS._linked_to_introns = [intronDS]
                            # append to tmp result list
                            return_list.append(
                                (intronDQ, intronDS, tinypacbporf, intronAQ,
                                 intronAS))

    # check if there are >1 candidate tiny exons
    # currently, we choose only to return the **best** mapped tinyexon
    if len(return_list) == 0:
        pass
    elif len(return_list) == 1:
        pass
    else:
        # only take the highest scoring candidate here
        min_distance = min([(a._distance + d._distance)
                            for a, b, c, d, e in return_list])
        pos2score = []
        for (intronDQ, intronDS, tinypacbporf, intronAQ,
             intronAS) in return_list:
            if (intronDQ._distance + intronAQ._distance) > min_distance:
                pos2score.append(0.0)
            else:
                # calculate overall pssm score
                total_pssm = 0.0
                total_pssm += intronDQ.donor.pssm_score
                total_pssm += intronDQ.acceptor.pssm_score
                total_pssm += intronDS.donor.pssm_score
                total_pssm += intronDS.acceptor.pssm_score
                total_pssm += intronAQ.donor.pssm_score
                total_pssm += intronAQ.acceptor.pssm_score
                total_pssm += intronAS.donor.pssm_score
                total_pssm += intronAS.acceptor.pssm_score
                pos2score.append(total_pssm)
        # get highest score and linked tinyexon
        max_score = max(pos2score)
        return_list = [return_list[pos2score.index(max_score)]]

    ############################################################################
    # some printing in verbose mode
    if verbose and return_list:
        (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) = return_list[0]
        print "BEST MAPPED TINYEXON:"
        print tinypacbporf
        print tinypacbporf.query, intronDQ._distance, intronAQ._distance,
        print(intronDQ.donor.pos, intronDQ.acceptor.pos),
        print(intronDS.donor.pos, intronDS.acceptor.pos),
        print(intronAQ.donor.pos, intronAQ.acceptor.pos),
        print(intronAS.donor.pos, intronAS.acceptor.pos)
    ############################################################################

    # return the result list
    return return_list
Exemplo n.º 6
0
def merge_pacbporfs_with_introns(pacbporfD,
                                 pacbporfA,
                                 verbose=False,
                                 **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max(
        [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_sbjct_pos = min(
        [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos')
    donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos')
    accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])),
                  order_by='pos')
    accepS = olba(list(Set([inS.acceptor for inS in intronsS])),
                  order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ]
        print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD,
                                                   **kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA,
                                                   **kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ2", [_aq.pos for (_aq, _as) in algacceps]
        print "dS2", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS2", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(
        algdonors,
        pacbporfD,
        'donor',
        min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(
        algacceps,
        pacbporfA,
        'acceptor',
        min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ3", [_aq.pos for (_aq, _as) in algacceps]
        print "dS3", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS3", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # make unique position lists for quick lookup in intron lists
    dQpl = Set([dQ.pos for dQ, dS in algdonors])
    dSpl = Set([dS.pos for dQ, dS in algdonors])
    aQpl = Set([aQ.pos for aQ, aS in algacceps])
    aSpl = Set([aS.pos for aQ, aS in algacceps])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append((intQ, intS))

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ, intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
            query=intQ.donor.pos, sbjct=intS.donor.pos)
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
            query=intQ.acceptor.pos, sbjct=intS.acceptor.pos)

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt -
               distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA)
        succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor,
                                            intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score,
                intS.donor.pssm_score,
                intQ.acceptor.pssm_score,
                intS.acceptor.pssm_score,
            )
        ########################################################################

    # return lists of aligned introns
    return algintrons