Пример #1
0
def prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,cbgR):
    """
    Flag two neighbouring graphs as splitted and link them with an interface

    cbgL gets IS_SPLITTED and IS_3P_SPLITTED set to True, cbgR gets
    IS_SPLITTED and IS_5P_SPLITTED set to True. After that, a single
    CodingBlockGraphInterface is created from the pair and stored on both
    graphs (cbgL._CBGinterface3p and cbgR._CBGinterface5p).

    @type  cbgL: graph object (CBG or lsrCBG)
    @param cbgL: the left (5' side) graph of the pair

    @type  cbgR: graph object (CBG or lsrCBG)
    @param cbgR: the right (3' side) graph of the pair

    @rtype:  None
    @return: nothing; both arguments are modified in place
    """
    # The flags are set BEFORE the interface is instantiated; the original
    # remark states that an lsrCBG / splitted interface is always is_optimal.
    for graph, side_flag in ((cbgL, 'IS_3P_SPLITTED'), (cbgR, 'IS_5P_SPLITTED')):
        graph.IS_SPLITTED = True
        setattr(graph, side_flag, True)

    # one shared interface object, referenced from both sides
    interface = CodingBlockGraphInterface(cbgL, cbgR)
    cbgL._CBGinterface3p = interface
    cbgR._CBGinterface5p = interface
Пример #2
0
    def create_cbginterfaces(self,
                             ignore_optimal=True,
                             ignore_compatible=True,
                             allow_phase_shift=False,
                             allow_non_canonical=False,
                             optimizetinyexoninterface=False,
                             verbose=False):
        """
        (Re)create CBGInterface objects in between CBGs in this GSG

        @type  ignore_optimal: Boolean
        @param ignore_optimal: Once a CBGInterface is optimal, do not recreate it

        @type  ignore_compatible: Boolean
        @param ignore_compatible: Once a CBGInterface is compatible, do not recreate it

        @type  allow_phase_shift: Boolean
        @param allow_phase_shift: (re)create CBGInterfaces allowing a phase shift of splice sites

        @type  allow_non_canonical: Boolean
        @param allow_non_canonical: (re)create CBGInterfaces allowing non-canonical (donor) splice sites

        @type  optimizetinyexoninterface: Boolean
        @param optimizetinyexoninterface: do a quick optimization of ths cbgIF
            (non-canonical, short suitable splice site range etc

        @type  verbose: Boolean
        @param verbose: print status messages to STDOUT

        @rtype:  Integer
        @return: number of CBGInterfaceobjects that is (re)created
        """
        RECREATED_CNT = 0
        for pos in range(1, len(self)):
            cbgD, cbgA = self[pos - 1], self[pos]
            CREATE_INTERFACE = False
            has_interface_donor = self.has_donor_cbginterface(cbgD)
            has_interface_acceptor = self.has_acceptor_cbginterface(cbgA)
            if has_interface_donor and has_interface_acceptor:
                # interface objects already exist; only (re)create when not ignore_optimal
                pass
                #if self.cbginterface_is_optimal_donor(cbgD) and self.cbginterface_is_optimal_acceptor(cbgA):
                #    if not ignore_optimal: CREATE_INTERFACE = True
                #elif self.cbginterface_is_compatible_donor(cbgD) and self.cbginterface_is_compatible_acceptor(cbgA):
                #    if not ignore_compatible: CREATE_INTERFACE = True
                #else:
                #    CREATE_INTERFACE = True
            elif has_interface_donor:
                CREATE_INTERFACE = True
            elif has_interface_acceptor:
                CREATE_INTERFACE = True
            else:
                CREATE_INTERFACE = True

            if CREATE_INTERFACE:
                cbgIF = CodingBlockGraphInterface(cbgD, cbgA)
                cbgIF.harvest_splice_sites(
                    allow_phase_shift=allow_phase_shift,
                    allow_non_canonical=allow_non_canonical)
                cbgIF.find_conserved_splice_sites()
                if optimizetinyexoninterface:
                    cbgIF.optimizetinyexoninterface()
                # and set the interface objects to the CBGs in GSG
                cbgD._CBGinterface3p = cbgIF
                cbgA._CBGinterface5p = cbgIF
                RECREATED_CNT += 1
                if verbose: print cbgIF
            else:
                if verbose: print cbgD._CBGinterface3p, "EXISTING"

        # set current first & last CBG as IS_FIRST and IS_LAST
        if len(self):
            self.codingblockgraphs[0].IS_FIRST = True
            self.codingblockgraphs[-1].IS_LAST = True

        # return counter for how much CBGInterfaces are recreated
        return RECREATED_CNT
Пример #3
0
def cbg_cexpander_inframe_intron_search(self,
        min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON,
        min_intron_nt_length = MIN_INTRON_NT_LENGTH,
        verbose=False):
        """
        @type  self: CodingBlockGraph
        @param self: CodingBlockGraph instance

        @type  min_total_pssm_score: float
        @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON

        @type  min_intron_nt_length: integer
        @param min_intron_nt_length: MIN_INTRON_NT_LENGTH

        @type  verbose: Boolean
        @param verbose: print status/debugging messages to STDOUT

        @rtype:  list or False
        @return: list with new (sub)CBGs or False when not splitted
        """
        ########################################################################
        if verbose:
            stw = StopWatch(name="cexpCbgIfIntron")
            stw.start()
        ########################################################################

        # return variable; list of splitted CBGs.
        return_cbg_list = [ self ]

        # create cexpander multiplealignment blocks
        cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander,
                verbose=verbose)

        # In freak-accident cases (one in thousends of times), cexpander produces
        # unequal amount of 1's in the binarystrings. This is theoretically impossible.
        # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns
        # False in these cases. Catch this here by quiting current 
        # cbg_cexpander_inframe_intron_search() function call and return False
        TODO=True
        if not cbgMA: return False

        ########################################################################
        if verbose:
            print stw.lap()
            blockscnt = len( cbgMA[ cbgMA.keys()[0] ] )
            print self
            print "BLOCKS:", blockscnt, self._cexpander.binarystring,
            print self._cexpander.projected_on
            for org in cbgMA.keys():
                print org, "\t", 
                for blockid in range(0,blockscnt):
                    if cbgMA[org][blockid].count("1") >= 1:
                        print len(cbgMA[org][blockid]), 
                    else:
                        print cbgMA[org][blockid], 
                print ""
        ########################################################################

        # loop over the aligned cexpander blocks and check the 
        # non-uniformly aligned blocks for length variation
        blockscnt  = len( cbgMA[ cbgMA.keys()[0] ] )
        oricbgomsr = self.overall_minimal_spanning_range()

        for blockid in range(0,blockscnt):
            # obtain non-uniformly aligned AA lengths for this block
            lengths = {}
            for org in cbgMA.keys():
                lengths[org] = cbgMA[org][blockid].count("0")
            # skip the uniformly aligned blocks
            if list(Set(lengths.values())) == [0]: continue
            ####################################################################
            if verbose: print stw.lap(), "lengths:", lengths
            ####################################################################

            # obtain coordinates for this area
            lsrcoords = {}
            for org in cbgMA.keys():
                node = self.node_by_organism(org)
                coordSta = min(oricbgomsr[node])
                # make summation of length of preceeding (non)aligned blocks
                for i in range(0,blockid):
                    coordSta += cbgMA[org][i].count("1") +\
                                cbgMA[org][i].count("0")
                # end coord is start coord + length of current block
                coordEnd = coordSta + lengths[org]
                lsrcoords[org] = ( coordSta, coordEnd )

            ####################################################################
            if verbose: print stw.lap(), "lsrcoords:", lsrcoords
            ####################################################################

            # translate AA lengths to NT lengths
            for k in lengths.keys(): lengths[k] = lengths[k]*3

            # check lenght discrepancy and assign putative inframe introns
            putative_inframe_intron_orgs =\
                _length_discrepancy_to_potential_inframe_introns(lengths)

            if not putative_inframe_intron_orgs:
                # no length discrepancy that can represent an inframe intron
                continue

            # organisms/genes for which an inframe intron can be an improvement
            # data dictionary. Keys: 'max_nt_length', 'min_nt_length', 
            # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm'
            inframe_intron_criteria = {}

            # find putative inframe introns in assigned genes/organisms
            putative_inframe_introns = {}
            for org in putative_inframe_intron_orgs:
                # assign inframe intron criteria for this organism
                inframe_intron_criteria[org] = {
                    'min_nt_length'     : min_intron_nt_length,
                    'min_total_pssm'    : min_total_pssm_score,
                    'min_donor_pos'     : (min(lsrcoords[org]) - 5) * 3,
                    'max_acceptor_pos'  : (max(lsrcoords[org]) + 5) * 3,
                    }

                # search for potential introns that can be responsible for this event
                theorf = self.get_orfs_of_graph(organism=org)[0]
                introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf,
                            min_intron_nt_length=min_intron_nt_length
                            )

                ################################################################
                if verbose: print "introns:", org, len(introns), "raw"
                ################################################################

                # filter introns for all outside the OMSR, to short, to long,
                # total pssm_score etc
                introns = _filter_putative_inframe_intron_list(
                        introns,org,inframe_intron_criteria)
                putative_inframe_introns[org] = introns
                ################################################################
                if verbose: print "introns:", org, len(introns), "filtered"
                ################################################################

            # check if all putative_inframe_intron_orgs have indeed introns
            # and check if all have at least a single intron phase in common
            if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]:
                # no introns in one or more organisms/genes -> continue
                continue
            if len( putative_inframe_introns )> 1:
                # do phase check in all organisms/genes
                phases = Set([0,1,2])
                for org, intronlist in putative_inframe_introns.iteritems():
                    thisphases = Set([ intron.phase for intron in intronlist ])
                    phases.intersection_update(thisphases)
                if len(phases) == 0:
                    ################################################################
                    if verbose: print "no mutual phase -> no cbgIF.is_optimal()"
                    ################################################################
                    # no mutual phase -> no cbgIF.is_optimal() possible lateron
                    continue
            else:
                pass

            # if an intron in at least a single organism is still there,
            # then split the involved pacbps in the `original` cbgL, the last
            # added CBG element in the return_cbg_list, and make a (virtual)
            # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps!
            cbgR = self.deepcopy()
            cbgL = self.deepcopy()

            # loop over the organisms/genes with inframe introns split
            # the Pacbps of these orgs in both to-become L and R CBGs 
            inframe_intron_orgs = putative_inframe_introns.keys()
            for org in inframe_intron_orgs:
                ################################################################
                if verbose:
                    print "splitting PACBPs for org:", org
                    print "L", cbgL
                    print "R", cbgL
                ################################################################
                node = self.node_by_organism(org)
                replacementsL = {}
                replacementsR = {}
                for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems():
                    if node in [node1,node2]:
                        # get the pacbp of this pacbporf and split it!
                        pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                        org1 = self.organism_by_node(node1)
                        org2 = self.organism_by_node(node2)

                        if org1 in putative_inframe_introns.keys() and\
                        org2 in putative_inframe_introns.keys() and\
                        inframe_intron_orgs.index(org) > 0:
                            # already splitted; both orgs are inframe introns!
                            continue

                        # make split coordinates relative
                        splitL = lsrcoords[org1][0] - pacbp.query_start
                        splitR = lsrcoords[org1][1] - pacbp.query_start

                        pacbpL = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitL,splitL),returnside='left')
                        pacbpR = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitR,splitR),returnside='rigth')

                        # check if both cbgL and cbgR make sence
                        # if not -> return False!
                        if not pacbpL: return False
                        if not pacbpR: return False

                        ########################################################
                        if verbose:
                            print "#", node1, node2, lsrcoords[org1], 
                            print "L:", splitL, "R:", splitR
                            print pacbp
                            print pacbpL
                            print pacbpR
                        ########################################################

                        # pacbpL -> extented pacbporfL -> store to replacementsL
                        newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfL.extend_pacbporf_after_stops()
                        replacementsL[(key,node1,node2)] = newpacbporfL

                        # pacbpR -> extented pacbporfR -> store to replacementsR
                        newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfR.extend_pacbporf_after_stops()
                        replacementsR[(key,node1,node2)] = newpacbporfR


                # do the pacbporf replacements in both CBGs
                statusL = _update_cbg_with_pacbporf_replacements(
                            cbgL,replacementsL)
                statusR = _update_cbg_with_pacbporf_replacements(
                            cbgR,replacementsR)

                # check if both cbgL and cbgR make sence
                if not statusL or not statusR:
                    # return unchanged cbg status -> False
                    return False
                    


            # Verify the interface between cbgL and cbgR.
            # Most likely, the sites are nicely alignable.
            cbgIF = CodingBlockGraphInterface(cbgL,cbgR)
            cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.harvest_splice_sites()
            cbgIF.find_conserved_splice_sites()

            ####################################################################
            if verbose:
                print cbgL
                print cbgIF
                print cbgR
                cbgIF.interfaceproperties()
            ####################################################################
            # check the properties of the CBGinterface
            if cbgIF.optimalitycheck().count(True) >= 2:
                # yes; is_compatible and donor and/or acceptor is optimal
                cbgL._CBGinterface3p = cbgIF
                cbgR._CBGinterface5p = cbgIF
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, cbgR ]
                ################################################################
                if verbose: print "INFRAME INTRON CONFIRMED!!"
                ################################################################
            else:
                # no compatible interface... although intron(s) was/were found!
                # (at least) two options are now open:
                # 1. enforce the intron(s) and create cbgIF with _forced_ends
                # 2. ignore the intron(s) and create an intermediate lsrCBG

                # 1. is `tricky`. First, how sure is this inframe intron,
                # what type of criteria do we assume etc etc.
                # second, how to create a coorect cbgIF? It must be an
                # IS_SPLITTED interface, of which the boundaries might fall
                # outside the OMSR's of the CBGs.

                # 2. ignore the intron(s) and create an intermediate lsrCBG
                lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR)
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, lsrCBG, cbgR ]
                ################################################################
                if verbose:
                    print "no INFRAME INTRON -> lsrCBG"
                    print cbgL
                    print " ", lsrCBG._CBGinterface5p
                    print " ", lsrCBG
                    print " ", lsrCBG._CBGinterface3p
                    print cbgR
                    self.printmultiplealignment()
                    print cbgL
                    cbgL.printmultiplealignment()
                    print cbgR
                    cbgR.printmultiplealignment()
                ################################################################

        # EOF this function.
        # return False if this CBG remained intact, list of splits when splitted
        if len(return_cbg_list) == 1:
            return False
        else:
            return return_cbg_list
Пример #4
0
    def check_lsrcbgs_for_inframe_introns(self,verbose=False):
        """
        Check the lsrCBGs in the GSG and see if these regions can better be explained by an inframe intron
        """
        INFRAME_INTRONS_PREDICTED = 0
        LSR_RECREATED             = 0
        for cbgpos in range(len(self)-1,-1,-1):
            cbg = self.codingblockgraphs[cbgpos]
            if cbg.__class__.__name__ != 'LowSimilarityRegionCodingBlockGraph':
                continue
            # do the inframe intron analyses on a lsrCBG
            inframeintrons = cbg.potentially_contains_inframe_intron(verbose=verbose)
            # aparantly it seems possible to create one or more introns in the lsrCBG
            if inframeintrons:
                # get the bordering CBGs
                prev = self.codingblockgraphs[cbgpos-1]
                next = self.codingblockgraphs[cbgpos+1]
                # make CBGInterface between prev and next;
                # reset the _IS_SPLITTED tags!
                prev._splicedonorgraph = None
                prev._CBGinterface3p   = None
                prev._forced_3p_ends   = {}
                prev.IS_3P_SPLITTED    = False
                prev.IS_SPLITTED       = prev.IS_5P_SPLITTED
                next._spliceacceptorgraph = None
                next._CBGinterface5p   = None
                next._forced_5p_ends   = {}
                next.IS_5P_SPLITTED    = False
                next.IS_SPLITTED       = next.IS_3P_SPLITTED
        
                # create an actual CBGInterface of both CBGs around the lsrCBG
                cbgIF = CodingBlockGraphInterface(prev,next)
                if verbose: print cbgIF
                # re-harvest splice sites; store ALL the intron-projected sites
                cbgIF.harvest_splice_sites(allow_phase_shift=False,store_all_projected_sites=True)
                if verbose: print cbgIF
                # now remove all non-projected splice-sites in organisms that
                # are not reported to have a potential inframe intron
                cbgIF.allow_intron_in_organisms(inframeintrons)
                cbgIF.find_conserved_splice_sites()
                if verbose:
                    print cbgIF
                    print "compatible:", cbgIF.is_compatible(), "optimal:", cbgIF.is_optimal()
                    print cbgIF._optimal_aligned_donor
                    print cbgIF._optimal_aligned_acceptor
                # yes, this is what we expect; a compatible CBGInterface!
                # this very likely represents an inframe intron!
                if cbgIF.is_compatible():
                    # remove the lsrCBG from the GSG
                    lsrCBG = self.codingblockgraphs.pop(cbgpos)
                    # set the CBGInterface object in next and prev CBG
                    prev._CBGinterface3p = cbgIF
                    next._CBGinterface5p = cbgIF
                    # increase the counter of number of inframe introns predicted
                    INFRAME_INTRONS_PREDICTED+=1
                    ############################################################
                    if verbose: print "INFRAME INTRON PREDICTED!!"
                    ############################################################

                else:
                    # nope, this does not seem like a proper inframe intron
                    # reset the CBGs and the lsrCBG objects as they were!

                    # If this point is reached, `first` and `second` are CBGs with exactly the same nodes
                    # create intermediate lsrCBG
                    prev.IS_SPLITTED    = True
                    prev.IS_3P_SPLITTED = True
                    next.IS_SPLITTED    = True
                    next.IS_5P_SPLITTED = True
                    lsrCBG = create_intermediate_lowsimilarity_region(prev,next)
                    self.codingblockgraphs[cbgpos]   = lsrCBG

                    # recreate the CBGInterfaces (I)
                    cbgIFa = CodingBlockGraphInterface(prev,lsrCBG)
                    cbgIFa.harvest_splice_sites()
                    cbgIFa.find_conserved_splice_sites()
                    # set the interface object to the CBGs in GSG
                    prev._CBGinterface3p   = cbgIFa
                    lsrCBG._CBGinterface5p = cbgIFa

                    # recreate the CBGInterfaces (II)
                    cbgIFb = CodingBlockGraphInterface(lsrCBG,next)
                    cbgIFb.harvest_splice_sites()
                    cbgIFb.find_conserved_splice_sites()
                    # set the interface object to the CBGs in GSG
                    lsrCBG._CBGinterface3p = cbgIFb
                    next._CBGinterface5p   = cbgIFb

                    ############################################################
                    if verbose: print "NO COMPATIBLE SITE!"
                    ############################################################

                    ###for org in inframeintrons:
                    ###    print org, "NO COMPATIBLE SITES FOUND!"
                    ###    print prev
                    ###    print cbgIF
                    ###    print next
                    ###    theorf = next.get_orfs_of_graph(organism = org )[0]
                    ###    print theorf
                    ###    theorf.printproteinanddna()
                    ###    for donor in theorf._donor_sites: print donor
                    ###    for acceptor in theorf._acceptor_sites: print acceptor
        
        # return number of found inframe introns
        return INFRAME_INTRONS_PREDICTED
Пример #5
0
    def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False):
        """
        Search CBGs in genestructure for lowsimilarity regions
        """

        ################################################################
        if verbose:
            stw = StopWatch(name='lsrCBGsearch')
            stw.start()
        ################################################################

        # Loop reversed through genestructure to make sure that once
        # a CBG is splitted, the positions of the remainder of the
        # list stay intact.
        for posinGSG in range(len(self)-1,-1,-1):
            sg = self.codingblockgraphs[posinGSG]
            # skip IGNORED, lsrCBG and CBGs that are incomplete (still await HMM completion) 
            if sg.IS_IGNORED: continue
            if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue
            if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue

            if verbose: print stw.lap(), posinGSG, "start"

            # check for potential aligned intron
            if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length):
                ########################################################
                if verbose:
                    print stw.lap(), posinGSG, "found"
                    for k,v in sg.getomsrproteinsequences().iteritems():
                        print ">%s\n%s\n" % (k,v)
                    print "ABOUT TO SPLIT:", sg
                    print sg._cexpander.binarystring,
                    print sg._cexpander.projected_on
                    sg.printmultiplealignment()
                    for k,pacbp in sg.pacbps.iteritems(): print k, pacbp
                ########################################################
                # now actually split by inframe intron
                res = sg.split_codingblock_by_inframe_intron()
                if len(res) == 1:
                    # no inframe intron found here
                    pass
                else:
                    # prepare the CBGs for insertion 
                    for pos in range(0,len(res)):
                        splittedCBG = res[pos]
                        splittedCBG.extend_pacbporfs(self.input)
                        splittedCBG.update_edge_weights_by_minimal_spanning_range()
                        splittedCBG.IS_SPLITTED = True
                        if pos > 0:
                            splittedCBG.IS_5P_SPLITTED = True
                            splittedCBG.IS_FIRST = False
                        if pos < len(res)-1:
                            splittedCBG.IS_3P_SPLITTED = True
                            splittedCBG.IS_LAST = False
                        # (re)create the cache for the splitted CBGs
                        splittedCBG.create_cache()
                        ################################################
                        if verbose:
                            print stw.lap(), posinGSG, "done!"
                            print "SUCCESFULLY SPLITTED:", splittedCBG
                            splittedCBG.printmultiplealignment()
                            print splittedCBG._cexpander.binarystring, 
                            print splittedCBG._cexpander.projected_on
                            print splittedCBG._omsr
                            for trf in splittedCBG._cexpander._transferblocks:
                                print trf.binarystring, trf.projected_on
                            for k,v in splittedCBG._cexpander.inputsequences.iteritems():
                                print v,"\t",k
                            for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems():
                                print orflist[0], _org
                            for pacbp in splittedCBG.pacbps.values():
                                print pacbp
                                pacbp.print_protein(_linesize=100)
                        ################################################

                    # create lsrCBGs and cbgIFs between them by looping in reversed
                    # order over all pairs of CBGs (because lsrCBG insertion in list)
                    for pos in range(len(res)-2,-1,-1):
                        cbgL,cbgR = res[pos:pos+2]
                        lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                        res.insert(pos+1,lsrCBG)
                        # create cbgIF between the CBGs and the lsrCBG
                        # just create -> cbgIF with lsrCBG is immediately is_optimal()
                        cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG)
                        cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR)
                        # set cbgIF objects to the CBGs and the lsrCBG
                        cbgL._CBGinterface3p   = cbgIFa
                        lsrCBG._CBGinterface5p = cbgIFa
                        lsrCBG._CBGinterface3p = cbgIFb
                        cbgR._CBGinterface5p   = cbgIFb

                    # update the first and last CBG in this list with the
                    # cbgIFs of the parental CBG (variable sg)
                    res[0]._CBGinterface5p =  sg._CBGinterface5p
                    res[-1]._CBGinterface3p = sg._CBGinterface3p
                    # update the original IS_FIRST/IS_LAST status
                    res[0].IS_FIRST = sg.IS_FIRST
                    res[-1].IS_LAST = sg.IS_LAST

                    # and set splittedCBGs to genestructure
                    # by replacing the existing CBG (variable sg) on the
                    # position posinGSG with the list op splitted CBGs
                    self.codingblockgraphs.__setslice__(posinGSG,posinGSG+1,res)

            else:
                # nope, no potential inframe intron; just append
                ###print sg.total_weight(), False
                pass
Пример #6
0
    def construct_final_tiny_cbg(self,
        max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH,
        max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH,
        take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS,
        take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS,
        take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS,
        maximal_current_stopcodongraph_average_weight=0.90,
        minimal_last_vs_new_identity_ratio=0.80,
        maximal_cexpander_cbg_tail_uniformity_aa_length=3,
        elegiable_donor_omsr_nt_offset=21,
        verbose=False):
        """
        Make a tiny final CBG by ``shooting tiny exons into the deep``
        """
        # get current last CBG
        last = self.get_final_cbg()

        # check if final tail of this CBG is uniformaly alignable
        cxpdrOutput = cexpanderanalyses_omsr2orfend(last)
        IS_UNIFORMLY_ALIGNED = True
        for trf in cxpdrOutput._transferblocks:
            if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"):
                IS_UNIFORMLY_ALIGNED = False
                break

        ############################################################
        if verbose:
            print "Cexpander uniformaly aligned:",
            print maximal_cexpander_cbg_tail_uniformity_aa_length,
            print "->", IS_UNIFORMLY_ALIGNED
            print "omsr:       ", last._cexpander.projected_on,
            print last._cexpander.binarystring
            trf = cxpdrOutput.get_transfer_of_projected_on(
                    last._cexpander.projected_on)
            if trf and trf != True:
                print "omsr2orfend:", last._cexpander.projected_on,
                print trf.binarystring
        ############################################################

        if IS_UNIFORMLY_ALIGNED:
            # break out of this function. Chance of overpredicting
            # a final tiny exon is bigger then finding a True one!
            return False

        # check if the stopcodongraph is not (very) good already
        if last._stopcodongraph.average_weight() >=\
        maximal_current_stopcodongraph_average_weight:
            # break out of this function. Chance of overpredicting
            # a final tiny exon is bigger then finding a True existing one
            return False

        # start the timer (performance benchmark in verbose mode)
        stw = StopWatch(name='stwFinalECG')
        stw.start()

        # get FinalExons on elegiable Orfs based on distance towards OMSR of
        # current last CBG and minimal acceptor site score
        omsr  = last.overall_minimal_spanning_range()
        maxsr = last.maximal_spanning_range()
        ECG = ExonCollectionGraph()

        ################################################################
        if verbose:
            print "currentLAST", last
            print last._stopcodongraph
            print last._stopcodongraph.is_optimal()
            for org in last.organism_set():
                print org, last._stopcodongraph.is_optimal(organism=org)
            for organism in last.organism_set():
                node = last.node_by_organism(organism)
                theorf = last.get_orfs_of_graph(organism=organism)[0]
                print organism, "\t", node, "\t", max(omsr[node]), "\t",
                print max(maxsr[node]), theorf.endPY/3
        ################################################################

        for organism in last.organism_set():
            node = last.node_by_organism(organism)
            # calculate an offset for the acceptor position
            # variable elegiable_acceptor_omsr_nt_offset is needed to
            # enlarge the OMSR definded offset. When the OMSR is by chance
            # a few nt or aa larger than the actual exon length, the true
            # acceptor position can be erroneously abandoned.
            offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset 
            theorf = last.get_orfs_of_graph(organism=organism)[0]

            # check if this final orf is self can serve as a final extension
            remaining_orf_nt_length          = (theorf.protein_endPY - max(omsr[node])) * 3
            remaining_maxsr_nt_length        = (max(maxsr[node]) - max(omsr[node])) * 3
            remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3 


            FIND_NEW_FINAL_ORFS       = True
            STORE_CURRENT_ORF_AS_FIOO = False 
            if remaining_maxsr_nt_length >= max_exon_nt_length:
                # exceptionally large maxsr on rigth side of omsr
                # store as FIOO but to NOT search for an orf extension!
                ### FIND_NEW_FINAL_ORFS       = False # discarded 17/09/2009; when poos maxsr present, overruled!
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_maxsr_tostop_nt_length <= 18:
                # maxsr is less then 6 AA apart from stop on current orf
                #FIND_NEW_FINAL_ORFS       = False
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_orf_nt_length < max_exon_nt_length:
                # final piece of unaligned sequence is a perfect HMM seed
                STORE_CURRENT_ORF_AS_FIOO = True
            else:
                pass

            if STORE_CURRENT_ORF_AS_FIOO:
                cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) )
                # set pssm_score to (very) high; this rewards
                # using the current Orf as the last Orf
                cbs.pssm_score = 20.0
                fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf)
                node = (organism,theorf.id,fioo.start,fioo.end)
                ECG.add_node_and_object(node,fioo)
                ################################################################
                if verbose:
                    print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length
                    print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY
                ################################################################

            if not FIND_NEW_FINAL_ORFS:
                # quit here -> no orf extension of this CBG
                continue

            # get elegiable (new) final orfs
            orflist = self.input[organism]['orfs'].get_elegiable_orfs(
                    max_orf_start=offset+max_intron_nt_length,
                    min_orf_end=offset )
            ################################################################
            if verbose:
                print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3
            ################################################################
            for orf in orflist:
                results = find_tailing_exon_on_orf(
                        theorf,orf,
                        current_donor_pos=offset,
                        max_tailingexon_nt_length=max_exon_nt_length,
                        max_tailingexon_intron_nt_length=max_intron_nt_length,
                        )
                for exon,intron in results:
                    node = (organism,orf.id,exon.start,exon.end)
                    if node not in ECG.get_nodes():
                        ECG.add_node_and_object(node,exon)
                        if verbose: print organism, node, exon

        if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count()

        # now take only the best `take_max_best_acceptors`
        # because there can be quite some of them!
        for organism in ECG.organism_set():
            objects = ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True )
            for obj in objects[take_max_best_acceptors:]:
                node = (organism,obj.orf.id,obj.start,obj.end)
                ECG.del_node(node)
                if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score

        ########################################################################
        if verbose:
            print stw.lap(), ">take_max_best_acceptors DELETED"
            for organism in ECG.organism_set():
                for obj in ordering.order_list_by_attribute(
                    ECG.get_organism_objects(organism),
                    order_by='pssm_score', reversed=True
                    ):
                    print "remaining", organism, obj.orf.id, obj.length, obj
        ######################################################################## 

        # only continue if all organisms are represented in the ECG
        if last.organism_set_size() > ECG.organism_set_size():
            if verbose: print "To few organisms/genes present -> return False"
            return False

        # create edges in the ECG between compatible phases and 
        # exon length, then make pacbps for these edges
        ECG.create_edges()
        ECG.make_pacbps_for_edges()
        if verbose:
            print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps)

        # search for complete graphs in this
        last_exon_graphs = ECG.find_fully_connected_subgraphs()

        ########################################################################
        if verbose: 
            print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()",
            print len(last_exon_graphs)
        ########################################################################

        # only continue if there is an perfectly aligned last exon graph
        if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0):
            ####################################################################
            if verbose: print "no perfect aligned last exon graph -> return False"
            ####################################################################
            return False

        # convert to CodingBlockGraphs
        new_last_cbgs = []
        for leg in last_exon_graphs[0:take_max_best_ecgs]:
            cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last)
            if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size():
                # create cache of CBG and do final check on quality
                cbg.create_cache()
                if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\
                cbg._cexpander.binarystring.find("1") == -1:
                    # discard hardly alignable CBGs
                    continue
                # if here, then append this cbg as a possible novel final CBG
                new_last_cbgs.append( cbg )
                ################################################################
                if verbose: print "LEGcbg", cbg
                ################################################################

        ########################################################################
        if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs)
        ########################################################################

        if not new_last_cbgs:
            ####################################################################
            if verbose: print "no ecgs convertable to CBGs -> return False"
            ####################################################################
            return False

        # order by total weight, get the optimal CBG and its corresponding ECG
        new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs)
        theNewLastCbg = None
        cbgIF = None


        # check all interfaces between the novel final CBGs and the previous
        # CBG. The best interface is added to the GSG!
        cbgif_accepted_new_last_cbgs = []
        already_checked_node_sets = []

        for newcbg in new_last_cbgs[0:take_max_best_cbgs]:
            lastExonGraph = newcbg._ExonCollectionGraph
            del( newcbg._ExonCollectionGraph )

            # check if it is not the extention of the current
            # last CBG (identical nodes)
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0:
                if verbose: print "newCBG is the extention of current last CBG!!"
                continue

            # check if this combination of nodes (orfs) has not been tried already
            if newcbg.get_ordered_nodes() in already_checked_node_sets:
                ###############################################################
                if verbose: 
                    print "newCBG node set done earlier:", 
                    print newcbg.get_ordered_nodes()
                ###############################################################
                continue
            else:
                # append this set of nodes (as a list) to checklist
                already_checked_node_sets.append( newcbg.get_ordered_nodes() )

            # check if this new final tinyexon graph has a compatible interface
            # with the current last one
            cbgIF = CodingBlockGraphInterface(last,newcbg)
            cbgIF.harvest_splice_sites()
            distinct_orgs = []
            for node in lastExonGraph.get_nodes():
                exon = lastExonGraph.get_node_object(node)
                if exon.acceptor.__class__.__name__ == 'SpliceAcceptor':
                    distinct_orgs.append( lastExonGraph.organism_by_node(node) )
            cbgIF.allow_intron_in_organisms(distinct_orgs)
            cbgIF.find_conserved_splice_sites()
            # do NOT optimize -> consumes a lot of time and is helpfull
            # only in extreme cases...
            #cbgIF.optimize()

            if not cbgIF.is_compatible():
                ################################################################
                if verbose:
                    print "newCBG not a is_compatible() cbgIF"
                    print newcbg
                ################################################################
                continue

            # append to cbgif_accepted_new_last_cbgs
            newcbg._CBGinterface5p = cbgIF
            cbgif_accepted_new_last_cbgs.append(
                    (
                        cbgIF.optimalitycheck().count(True),
                        newcbg.total_weight(),
                        newcbg
                    )
                )

        ########################################################################
        if verbose:
            print stw.lap(), "cbgIFs checked %s/%s" % (
                len(cbgif_accepted_new_last_cbgs),
                len(new_last_cbgs[0:take_max_best_cbgs])
                )
        ########################################################################
        # now start by adding the highest scoring newcbg first
        cbgif_accepted_new_last_cbgs.sort()
        cbgif_accepted_new_last_cbgs.reverse()

        ########################################################################
        if verbose:
            print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs)
            for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
        ########################################################################

        for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
            # get the already created cbgIF from the newcbg graph
            cbgIF = newcbg._CBGinterface5p
    
            # now check 4 criteria:
            # (1) cbgIF.is_optimal() (2) >GTG.identity
            # (3) >STG.totalweight   (4) <STG.distance
            criteria = []
            criteria.append( cbgIF.is_optimal() )
            criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() )
            criteria.append( newcbg.genetree().identity() > last.genetree().identity() )
            criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() )

            ####################################################################
            if verbose:
                print "TRYING ADDITION of final newcbg", criteria
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
            ####################################################################

            # check if there is only a single different node/orf changed in the newcbg
            # this is recognized by a symmetric_difference of size 2 
            # in this case, be very strict! This easily causes overprediction (FP) tiny exons 
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2:
                # check if 4 criteria are valid;
                # a single False results in not accepting this new last tiny cbg
                if False in criteria:
                    if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria
                    # continue -> no new tiny CBG
                    continue

            # now start check the criteria.
            # if criteria[0] == True, means a fully is_optimal interface!
            # do not perform any additional check, just add!
            if criteria[0] == True:
                theNewLastCbg = newcbg
                break
            
            # total weight criterion -> new.tw() > last.tw()
            if criteria[1] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; to low total weight"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue

            # identity criterion -> allow a ratio i.s.o. new.id() > last.id()
            # this strict criterion (>) is applied for single-new-orf-CBGs
            if criteria[2] == False:
                ratio = newcbg.genetree().identity() / last.genetree().identity()
                if ratio < minimal_last_vs_new_identity_ratio:
                    ######################################################################
                    if verbose:
                        print "# NOVEL lastTinyExon discarded; to low identity"
                        print "#", newcbg._stopcodongraph, newcbg.genetree().identity()
                    ######################################################################
                    # continue -> no new tiny CBG
                    continue
 
            if criteria[3] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue
 
            # if this point is reached, a new tiny last CBG has been found!
            theNewLastCbg = newcbg
            # break out of the for loop; store into the genestructure
            break



        # all okay -> ready for inserting the new CBG
        if theNewLastCbg and verbose:
            ################################################################################
            print "NEW FINAL TINY EXON FOUND!!"
            print theNewLastCbg
            print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable()
            print cbgIF._optimal_aligned_donor, cbgIF.donor_phase()
            print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase()
            ################################################################################

        # hard-insert into the genestructure
        # using add_codingblock is likely to cause problems
        # because of the tinyness of the CBG
        if theNewLastCbg:
            for pos in range(0,len(self)):
                if self.codingblockgraphs[pos].IS_IGNORED: continue
                if self.codingblockgraphs[pos].IS_LAST:
                    thelast = self.codingblockgraphs[pos]
                    thelast.IS_LAST = False
                    newcbg.IS_LAST  = True
                    self.codingblockgraphs.insert(pos+1,theNewLastCbg)
                    # set the CBGInterface object in next and prev CBG
                    self.codingblockgraphs[pos]._CBGinterface3p = cbgIF
                    self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF
                    # break out; end of this function
                    break

            # done! return a True because newcbg is created & inserted
            return True
        else:
            # no newLastCbg found
            return False