Пример #1
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA,
    verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ,intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)
        ########################################################################
        if verbose:
            print (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt-distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append( ( intQ, intS ) )

    ############################################################################
    if verbose:
        for intQ,intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos)
    ############################################################################


    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ,intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)

        if distDnt > 0:   # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode  = "SQ"
            qEnd  = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart= qEnd - max([distA,distD])
            sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd  = sStart + max([distA,distD])
            qSeq  = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)

        else: # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode  = "QS"
            qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd  = qStart - min([distA,distD])
            sEnd  = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart= sEnd + min([distA,distD])
            qSeq  = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)


        headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq)
        headerQ = headerQ[0:20] # truncate to prevent error
        headerS = headerS[0:20] # truncate to prevent error
        if verbose:
            print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue # superfluous check-doublecheck for sequence
        if not sSeq: continue # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = { headerQ: qSeq, headerS: sSeq }
        (alignedseqs,alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(
                    alignment=(
                            alignedseqs[headerQ],
                            alignment,
                            alignedseqs[headerS]
                            ),
                    coords=(qStart,qEnd,sStart,sEnd)
                    )

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################


            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt-distAnt)
            intS._distance = abs(distDnt-distAnt)
    
            if distDnt > 0:   # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA)
                succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf)
                succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [ cig_pacbporf ]
            intS._linked_to_pacbporfs = [ cig_pacbporf ]


            # append to found_cig_list
            found_cig_list.append( ( intQ, intS, cig_pacbporf ) )

        else:
            # no alignment possible -> try next
            continue
    
    # return lists of closeby_independant_introns
    return found_cig_list
Пример #2
0
def update_PCG_with_signalpexons(signalpexonseqs,
                                 PCG,
                                 OPTIONS,
                                 min_pacbporf_identityscore=0.20,
                                 verbose=True):
    """ """
    if not signalpexonseqs.has_key(OPTIONS.target): return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[OPTIONS.target]:
        target = OPTIONS.target
        for informant, infSPlist in signalpexonseqs.iteritems():
            if informant == OPTIONS.target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            # get ordered pacbporfs fromt he PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(OPTIONS.target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))

                if False in [
                        pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                        for pacbporf in thepacbporfs
                ]:
                    # *NOT* placable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TO FAR in front of current gene structure parts.
                    # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS
                    continue
                elif dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue
                else:
                    pass

                    # perform ClustalW alignment on the SP exons
                    (alignedseqs,alignment) =\
                clustalw( seqs= {
                    OPTIONS.target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[OPTIONS.target], alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identyscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf,
                                                     informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                ################################################################
                if verbose:
                    print alignedseqs[OPTIONS.target], OPTIONS.target
                    print alignment
                    print alignedseqs[informant], informant
                    if pacbp:
                        print pacbp, (OPTIONS.target, targetSPexon.orf.id),
                        print(informant, informantSPexon.orf.id),
                        print "DISTANCE::", dist
                        pacbp.print_protein()
                        print ""
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,
                             OPTIONS.target,
                             informant,
                             PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print "SignalP Exon added to PCG:", signalp_pacbporf, informant
                ####################################################################
            else:
                pass

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
Пример #3
0
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS,
    min_pacbporf_identityscore=0.20,verbose=True):
    """ """
    if not signalpexonseqs.has_key(OPTIONS.target): return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[OPTIONS.target]:
        target = OPTIONS.target
        for informant,infSPlist in signalpexonseqs.iteritems():
            if informant == OPTIONS.target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            # get ordered pacbporfs fromt he PCG
            thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(OPTIONS.target,informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            for informantSPexon in infSPlist:
                coords  = [ targetSPexon.protein_start(),
                            targetSPexon.protein_end(),
                            informantSPexon.protein_start(),
                            informantSPexon.protein_end(), ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                        targetSPexon.proteinsequence(),
                        informantSPexon.proteinsequence(),
                        targetSPexon.protein_start(),
                        informantSPexon.protein_start(),
                        ) )

                if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]:
                    # *NOT* placable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3:
                    # WAY TO FAR in front of current gene structure parts.
                    # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS
                    continue
                elif dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue
                else:
                    pass

                # perform ClustalW alignment on the SP exons
                    (alignedseqs,alignment) =\
                clustalw( seqs= { 
                    OPTIONS.target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                            alignment=(
                                    alignedseqs[OPTIONS.target],
                                    alignment,
                                    alignedseqs[informant]
                                    ),
                            coords=coords
                            )

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identyscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp,
                        targetSPexon.orf,informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append( signalpexonPacbpORF )

                ################################################################
                if verbose:
                    print alignedseqs[OPTIONS.target], OPTIONS.target
                    print alignment
                    print alignedseqs[informant], informant
                    if pacbp:
                        print pacbp, (OPTIONS.target, targetSPexon.orf.id),
                        print (informant, informantSPexon.orf.id),
                        print "DISTANCE::", dist
                        pacbp.print_protein()
                        print ""
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                        signalpexon_pacbp_list,order_by='bits',reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,OPTIONS.target,informant,PCG,source='SignalP-ClustalW') 
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print "SignalP Exon added to PCG:", signalp_pacbporf, informant
                ####################################################################
            else:
                pass

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
Пример #4
0
def clustalwinput2cbg(seqs,orfs,coords,nodes,
    matrix = None,
    minimal_overall_spanning_range_size = 3,
    verbose=False):
    """

    @type  seqs: dict
    @param seqs: dict with ORGANISM IDENTIFIER as keys, sequences as values

    @type  orfs: dict
    @param orfs: dict with ORGANISM IDENTIFIER as keys, Orf objects as values

    @type  coords: dict
    @param coords: dict with ORGANISM IDENTIFIER as keys, [ sta, end ] as values

    @type  nodes: list
    @param nodes: list with nodes corresponding to the ORGANISM IDENTIFIER in the dictionaries

    @attention: coordinates in coords should correspond to the sequneces in seqs!

    """
    # do clustalw and strip_alignment_for_exterior_gaps
    (algseqs,algm) = clustalw(seqs=seqs)
    ####################################################################
    if verbose: print seqs, "\n", algseqs, "\n", algm, "\n", coords
    ####################################################################
    _testalgseqs,_testalgm,_testcoords = strip_alignment_for_exterior_gaps(
        deepcopy(algseqs),deepcopy(algm),deepcopy(coords))
    if not _testalgm:
        ####################################################################
        if verbose: print "NO ALGM\n", seqs, "\n", _testalgseqs, "\n", _testalgm
        ####################################################################
        # alignment completely vanished by `strip_alignment_for_exterior_gaps`
        return None

    # do required import here (prevent circular imports)
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion

    if not matrix:
        raise "No ProteinSimilarityMatrix applied!"

    # translate the clustalw alignment into an artificial CBG
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(nodes)
    pacbp_is_none = False
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA = newcbg.organism_by_node(nodeA)
        orgB = newcbg.organism_by_node(nodeB)

        # create stripped alignments for this pair of sequences
        # do not forget to make deepcopies of the data structures!
        subcoords  = { orgA: coords[orgA], orgB: coords[orgB] }
        subalgseqs = { orgA: algseqs[orgA], orgB: algseqs[orgB] }
        _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps(
            deepcopy(subalgseqs),deepcopy(algm),deepcopy(subcoords) )

        # recreate a pairwise ClustalW alignment string
        _algm = make_clustalw_alignment_match(
                _algseqs[orgA],_algseqs[orgB],
                matrix = matrix.matrix )

        # _algseqs keys are organisms, not nodes!
        alignment  = ( _algseqs[orgA], _algm, _algseqs[orgB] )
        paircoords = ( _coords[orgA][0], _coords[orgA][1],
                       _coords[orgB][0], _coords[orgB][1] )
        pacbp = pacbconversion.pacbp_from_clustalw(
                alignment=alignment,coords=paircoords)
        if pacbp == None:
            # pacbp is not creatable -> break i.o.t. return None
            pacbp_is_none = True
            break
        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])
        ####################################################################
        if verbose:
            print orgA, orgB, pacbporf
            for item in alignment: print item
            print paircoords
        ####################################################################
        wt = pacbporf.bitscore
        pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=wt)
        newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf

    # check if all pacbporfs are created succesfully
    if pacbp_is_none: return None

    # update edge weight by OMSR and return
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE =\
        minimal_overall_spanning_range_size

    if newcbg.has_overall_minimal_spanning_range():
        newcbg.update_edge_weights_by_minimal_spanning_range()
        try:
            newcbg.correct_pacbpgaps_nearby_omsr()
            return newcbg
        except NoOverallMinimalSpanningRange:
            return None
    else:
        return None
Пример #5
0
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=0,verbose=False):
    """ """
    # gather sequence concerning the scaffold gap of the mutual nodes
    seqs, orfs, coords = {}, {}, {}
    for node in sprdif.keys():
        org = cbg.organism_by_node(node)
        sta = min( sprdif[node] ) - SCAFFOLD_GAP_OMSR_OFFSET
        end = max( sprdif[node] ) + SCAFFOLD_GAP_OMSR_OFFSET
        orf = cbg.get_orfs_of_graph(organism=org)[0]
        seq = orf.getaas(abs_pos_start=sta,abs_pos_end=end)
        seqs[org]   = seq
        orfs[org]   = orf
        coords[org] = [sta,end]

    # do clustalw and strip_alignment_for_exterior_gaps
    (_algseqs,_algm) = clustalw(seqs=seqs)
    ####################################################################
    if verbose: print seqs, "\n", _algseqs, "\n", _algm
    ####################################################################
    _algseqs,_algm,coords = strip_alignment_for_exterior_gaps(_algseqs,_algm,coords)
    if not _algm:
        ####################################################################
        if verbose: print "NO ALGM.??\n", seqs, "\n", _algseqs, "\n", _algm
        ####################################################################
        # alignment completely vanished by `strip_alignment_for_exterior_gaps`
        return None

    # do required import here (prevent circular imports)
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion
    from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions

    # translate the clustalw alignment into an artificial CBG
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(sprdif.keys())
    pacbp_is_none = False
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA       = cbg.organism_by_node(nodeA)
        orgB       = cbg.organism_by_node(nodeB)
        # _algseqs keys are organisms, not nodes!
        alignment  = ( _algseqs[orgA], _algm, _algseqs[orgB] )
        paircoords = ( coords[orgA][0], coords[org][1], coords[orgB][0], coords[orgB][1] )
        pacbp = pacbconversion.pacbp_from_clustalw(alignment=alignment,coords=paircoords)
        if pacbp == None:
            # pacbp is not creatable -> break i.o.t. return None
            pacbp_is_none = True
            break
        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])
        wt = pacbporf.bitscore
        pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=wt)
        newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf

    # check if all pacbporfs are created succesfully
    if pacbp_is_none: return None

    # update edge weight by OMSR and return
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3
    if newcbg.has_overall_minimal_spanning_range():
        newcbg.update_edge_weights_by_minimal_spanning_range()
        try:
            newcbg.correct_pacbpgaps_nearby_omsr()
            return newcbg
        except NoOverallMinimalSpanningRange:
            return None
        #try:
        #    status = cexpander_checkCBG4omsrbordergaps(newcbg)
        #    return newcbg 
        #except NoOverallMinimalSpanningRange:
        #    return None
        #except ZeroUniformlyAlignedPositions:
        #    return None
        #except:
        #    return None
    else:
        return None
Пример #6
0
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=1,verbose=False):
    """ """
    # gather sequence concerning the scaffold gap of the mutual nodes
    seqs, orfs, coords = {}, {}, {}
    for node in sprdif.keys():
        org = cbg.organism_by_node(node)
        sta = min( sprdif[node] ) - SCAFFOLD_GAP_OMSR_OFFSET
        end = max( sprdif[node] ) + SCAFFOLD_GAP_OMSR_OFFSET
        orf = cbg.get_orfs_of_graph(organism=org)[0]
        # correct a priori for out-of-range exceptions
        # due to SCAFFOLD_GAP_OMSR_OFFSET
        sta = max([ sta, orf.protein_startPY ])
        end = min([ end, orf.protein_endPY ])
        seq = orf.getaas(abs_pos_start=sta,abs_pos_end=end)
        seqs[org]   = seq
        orfs[org]   = orf
        coords[org] = [sta,end]

    # do clustalw and strip_alignment_for_exterior_gaps
    (algseqs,algm) = clustalw(seqs=seqs)
    ####################################################################
    if verbose: print seqs, "\n", algseqs, "\n", algm, "\n", coords
    ####################################################################
    _testalgseqs,_testalgm,_testcoords = strip_alignment_for_exterior_gaps(
        deepcopy(algseqs),deepcopy(algm),deepcopy(coords))
    if not _testalgm:
        ####################################################################
        if verbose: print "NO ALGM\n", seqs, "\n", _testalgseqs, "\n", _testalgm
        ####################################################################
        # alignment completely vanished by `strip_alignment_for_exterior_gaps`
        return None

    # do required import here (prevent circular imports)
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion
    from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions

    # translate the clustalw alignment into an artificial CBG
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(sprdif.keys())
    pacbp_is_none = False
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA = cbg.organism_by_node(nodeA)
        orgB = cbg.organism_by_node(nodeB)

        # create stripped alignments for this pair of sequences
        # do not forget to make deepcopies of the data structures!
        subcoords  = { orgA: coords[orgA], orgB: coords[orgB] }
        subalgseqs = { orgA: algseqs[orgA], orgB: algseqs[orgB] }
        _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps(
            deepcopy(subalgseqs),deepcopy(algm),deepcopy(subcoords) )

        # get a/the ProteinSimilarityMatrix from the original PacbP(ORF)
        # and then recreate a pairwise ClustalW alignment string
        protsimmtrx = cbg.get_pacbps_by_nodes(node1=nodeA,node2=nodeB)[0].MATRIX
        _algm = make_clustalw_alignment_match(
                _algseqs[orgA],_algseqs[orgB],
                matrix = protsimmtrx.matrix )

        # _algseqs keys are organisms, not nodes!
        alignment  = ( _algseqs[orgA], _algm, _algseqs[orgB] )
        paircoords = ( _coords[orgA][0], _coords[orgA][1],
                       _coords[orgB][0], _coords[orgB][1] )
        pacbp = pacbconversion.pacbp_from_clustalw(
                alignment=alignment,coords=paircoords)
        if pacbp == None:
            # pacbp is not creatable -> break i.o.t. return None
            pacbp_is_none = True
            break
        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])
        ####################################################################
        if verbose:
            print orgA, orgB, pacbporf
            for item in alignment: print item
            print paircoords
        ####################################################################
        wt = pacbporf.bitscore
        pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=wt)
        newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf

    # check if all pacbporfs are created succesfully
    if pacbp_is_none: return None

    # update edge weight by OMSR and return
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3
    if newcbg.has_overall_minimal_spanning_range():
        newcbg.update_edge_weights_by_minimal_spanning_range()
        try:
            newcbg.correct_pacbpgaps_nearby_omsr()
            return newcbg
        except NoOverallMinimalSpanningRange:
            return None
    else:
        return None
Пример #7
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,
                                                     pacbporfA,
                                                     verbose=False,
                                                     **kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs[
            'cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,
                                               pacbporfA,
                                               verbose=verbose,
                                               **kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[
            'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ, intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)
        ########################################################################
        if verbose:
            print(intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt - distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append((intQ, intS))

    ############################################################################
    if verbose:
        for intQ, intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos)
    ############################################################################

    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ, intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)

        if distDnt > 0:  # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode = "SQ"
            qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart = qEnd - max([distA, distD])
            sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd = sStart + max([distA, distD])
            qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        else:  # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode = "QS"
            qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd = qStart - min([distA, distD])
            sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart = sEnd + min([distA, distD])
            qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq)
        headerQ = headerQ[0:20]  # truncate to prevent error
        headerS = headerS[0:20]  # truncate to prevent error
        if verbose:
            print mode, (
                distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue  # superfluous check-doublecheck for sequence
        if not sSeq: continue  # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = {headerQ: qSeq, headerS: sSeq}
        (alignedseqs, alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment,
                                               alignedseqs[headerS]),
                                    coords=(qStart, qEnd, sStart, sEnd))

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ,
                                              pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ,
                                              pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################

            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt - distAnt)
            intS._distance = abs(distDnt - distAnt)

            if distDnt > 0:  # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA)
                succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf)
                succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [cig_pacbporf]
            intS._linked_to_pacbporfs = [cig_pacbporf]

            # append to found_cig_list
            found_cig_list.append((intQ, intS, cig_pacbporf))

        else:
            # no alignment possible -> try next
            continue

    # return lists of closeby_independant_introns
    return found_cig_list