def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False): """ Search CBGs in genestructure for lowsimilarity regions """ ################################################################ if verbose: stw = StopWatch(name='lsrCBGsearch') stw.start() ################################################################ # Loop reversed through genestructure to make sure that once # a CBG is splitted, the positions of the remainder of the # list stay intact. for posinGSG in range(len(self)-1,-1,-1): sg = self.codingblockgraphs[posinGSG] # skip IGNORED, lsrCBG and CBGs that are incomplete (still await HMM completion) if sg.IS_IGNORED: continue if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue if verbose: print stw.lap(), posinGSG, "start" # check for potential aligned intron if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length): ######################################################## if verbose: print stw.lap(), posinGSG, "found" for k,v in sg.getomsrproteinsequences().iteritems(): print ">%s\n%s\n" % (k,v) print "ABOUT TO SPLIT:", sg print sg._cexpander.binarystring, print sg._cexpander.projected_on sg.printmultiplealignment() for k,pacbp in sg.pacbps.iteritems(): print k, pacbp ######################################################## # now actually split by inframe intron res = sg.split_codingblock_by_inframe_intron() if len(res) == 1: # no inframe intron found here pass else: # prepare the CBGs for insertion for pos in range(0,len(res)): splittedCBG = res[pos] splittedCBG.extend_pacbporfs(self.input) splittedCBG.update_edge_weights_by_minimal_spanning_range() splittedCBG.IS_SPLITTED = True if pos > 0: splittedCBG.IS_5P_SPLITTED = True splittedCBG.IS_FIRST = False if pos < len(res)-1: splittedCBG.IS_3P_SPLITTED = True splittedCBG.IS_LAST = False # (re)create the cache for the splitted CBGs splittedCBG.create_cache() ################################################ if verbose: print stw.lap(), posinGSG, "done!" 
print "SUCCESFULLY SPLITTED:", splittedCBG splittedCBG.printmultiplealignment() print splittedCBG._cexpander.binarystring, print splittedCBG._cexpander.projected_on print splittedCBG._omsr for trf in splittedCBG._cexpander._transferblocks: print trf.binarystring, trf.projected_on for k,v in splittedCBG._cexpander.inputsequences.iteritems(): print v,"\t",k for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems(): print orflist[0], _org for pacbp in splittedCBG.pacbps.values(): print pacbp pacbp.print_protein(_linesize=100) ################################################ # create lsrCBGs and cbgIFs between them by looping in reversed # order over all pairs of CBGs (because lsrCBG insertion in list) for pos in range(len(res)-2,-1,-1): cbgL,cbgR = res[pos:pos+2] lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) res.insert(pos+1,lsrCBG) # create cbgIF between the CBGs and the lsrCBG # just create -> cbgIF with lsrCBG is immediately is_optimal() cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG) cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR) # set cbgIF objects to the CBGs and the lsrCBG cbgL._CBGinterface3p = cbgIFa lsrCBG._CBGinterface5p = cbgIFa lsrCBG._CBGinterface3p = cbgIFb cbgR._CBGinterface5p = cbgIFb # update the first and last CBG in this list with the # cbgIFs of the parental CBG (variable sg) res[0]._CBGinterface5p = sg._CBGinterface5p res[-1]._CBGinterface3p = sg._CBGinterface3p # update the original IS_FIRST/IS_LAST status res[0].IS_FIRST = sg.IS_FIRST res[-1].IS_LAST = sg.IS_LAST # and set splittedCBGs to genestructure # by replacing the existing CBG (variable sg) on the # position posinGSG with the list op splitted CBGs self.codingblockgraphs.__setslice__(posinGSG,posinGSG+1,res) else: # nope, no potential inframe intron; just append ###print sg.total_weight(), False pass
def cbg_cexpander_inframe_intron_search(self, min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON, min_intron_nt_length = MIN_INTRON_NT_LENGTH, verbose=False): """ @type self: CodingBlockGraph @param self: CodingBlockGraph instance @type min_total_pssm_score: float @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON @type min_intron_nt_length: integer @param min_intron_nt_length: MIN_INTRON_NT_LENGTH @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list or False @return: list with new (sub)CBGs or False when not splitted """ ######################################################################## if verbose: stw = StopWatch(name="cexpCbgIfIntron") stw.start() ######################################################################## # return variable; list of splitted CBGs. return_cbg_list = [ self ] # create cexpander multiplealignment blocks cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander, verbose=verbose) # In freak-accident cases (one in thousends of times), cexpander produces # unequal amount of 1's in the binarystrings. This is theoretically impossible. # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns # False in these cases. Catch this here by quiting current # cbg_cexpander_inframe_intron_search() function call and return False TODO=True if not cbgMA: return False ######################################################################## if verbose: print stw.lap() blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) print self print "BLOCKS:", blockscnt, self._cexpander.binarystring, print self._cexpander.projected_on for org in cbgMA.keys(): print org, "\t", for blockid in range(0,blockscnt): if cbgMA[org][blockid].count("1") >= 1: print len(cbgMA[org][blockid]), else: print cbgMA[org][blockid], print "" ######################################################################## # loop over the aligned cexpander blocks and check the # non-uniformly aligned blocks for length variation blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) oricbgomsr = self.overall_minimal_spanning_range() for blockid in range(0,blockscnt): # obtain non-uniformly aligned AA lengths for this block lengths = {} for org in cbgMA.keys(): lengths[org] = cbgMA[org][blockid].count("0") # skip the uniformly aligned blocks if list(Set(lengths.values())) == [0]: continue #################################################################### if verbose: print stw.lap(), "lengths:", lengths #################################################################### # obtain coordinates for this area lsrcoords = {} for org in cbgMA.keys(): node = self.node_by_organism(org) coordSta = min(oricbgomsr[node]) # make summation of length of preceeding (non)aligned blocks for i in range(0,blockid): coordSta += cbgMA[org][i].count("1") +\ cbgMA[org][i].count("0") # end coord is start coord + length of current block coordEnd = coordSta + lengths[org] lsrcoords[org] = ( coordSta, coordEnd ) #################################################################### if verbose: print stw.lap(), "lsrcoords:", lsrcoords #################################################################### # translate AA lengths to NT lengths for k in lengths.keys(): lengths[k] = lengths[k]*3 # check lenght discrepancy and assign putative inframe introns putative_inframe_intron_orgs =\ _length_discrepancy_to_potential_inframe_introns(lengths) if not putative_inframe_intron_orgs: # no length discrepancy that can represent an inframe intron continue # organisms/genes for which an inframe intron 
can be an improvement # data dictionary. Keys: 'max_nt_length', 'min_nt_length', # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm' inframe_intron_criteria = {} # find putative inframe introns in assigned genes/organisms putative_inframe_introns = {} for org in putative_inframe_intron_orgs: # assign inframe intron criteria for this organism inframe_intron_criteria[org] = { 'min_nt_length' : min_intron_nt_length, 'min_total_pssm' : min_total_pssm_score, 'min_donor_pos' : (min(lsrcoords[org]) - 5) * 3, 'max_acceptor_pos' : (max(lsrcoords[org]) + 5) * 3, } # search for potential introns that can be responsible for this event theorf = self.get_orfs_of_graph(organism=org)[0] introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf, min_intron_nt_length=min_intron_nt_length ) ################################################################ if verbose: print "introns:", org, len(introns), "raw" ################################################################ # filter introns for all outside the OMSR, to short, to long, # total pssm_score etc introns = _filter_putative_inframe_intron_list( introns,org,inframe_intron_criteria) putative_inframe_introns[org] = introns ################################################################ if verbose: print "introns:", org, len(introns), "filtered" ################################################################ # check if all putative_inframe_intron_orgs have indeed introns # and check if all have at least a single intron phase in common if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]: # no introns in one or more organisms/genes -> continue continue if len( putative_inframe_introns )> 1: # do phase check in all organisms/genes phases = Set([0,1,2]) for org, intronlist in putative_inframe_introns.iteritems(): thisphases = Set([ intron.phase for intron in intronlist ]) phases.intersection_update(thisphases) if len(phases) == 0: ################################################################ if verbose: print "no mutual phase -> no cbgIF.is_optimal()" ################################################################ # no mutual phase -> no cbgIF.is_optimal() possible lateron continue else: pass # if an intron in at least a single organism is still there, # then split the involved pacbps in the `original` cbgL, the last # added CBG element in the return_cbg_list, and make a (virtual) # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps! cbgR = self.deepcopy() cbgL = self.deepcopy() # loop over the organisms/genes with inframe introns split # the Pacbps of these orgs in both to-become L and R CBGs inframe_intron_orgs = putative_inframe_introns.keys() for org in inframe_intron_orgs: ################################################################ if verbose: print "splitting PACBPs for org:", org print "L", cbgL print "R", cbgL ################################################################ node = self.node_by_organism(org) replacementsL = {} replacementsR = {} for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems(): if node in [node1,node2]: # get the pacbp of this pacbporf and split it! pacbp = pacb.conversion.pacbporf2pacbp(pacbporf) org1 = self.organism_by_node(node1) org2 = self.organism_by_node(node2) if org1 in putative_inframe_introns.keys() and\ org2 in putative_inframe_introns.keys() and\ inframe_intron_orgs.index(org) > 0: # already splitted; both orgs are inframe introns! 
continue # make split coordinates relative splitL = lsrcoords[org1][0] - pacbp.query_start splitR = lsrcoords[org1][1] - pacbp.query_start pacbpL = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitL,splitL),returnside='left') pacbpR = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitR,splitR),returnside='rigth') # check if both cbgL and cbgR make sence # if not -> return False! if not pacbpL: return False if not pacbpR: return False ######################################################## if verbose: print "#", node1, node2, lsrcoords[org1], print "L:", splitL, "R:", splitR print pacbp print pacbpL print pacbpR ######################################################## # pacbpL -> extented pacbporfL -> store to replacementsL newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL, pacbporf.orfQ,pacbporf.orfS) newpacbporfL.extend_pacbporf_after_stops() replacementsL[(key,node1,node2)] = newpacbporfL # pacbpR -> extented pacbporfR -> store to replacementsR newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR, pacbporf.orfQ,pacbporf.orfS) newpacbporfR.extend_pacbporf_after_stops() replacementsR[(key,node1,node2)] = newpacbporfR # do the pacbporf replacements in both CBGs statusL = _update_cbg_with_pacbporf_replacements( cbgL,replacementsL) statusR = _update_cbg_with_pacbporf_replacements( cbgR,replacementsR) # check if both cbgL and cbgR make sence if not statusL or not statusR: # return unchanged cbg status -> False return False # Verify the interface between cbgL and cbgR. # Most likely, the sites are nicely alignable. cbgIF = CodingBlockGraphInterface(cbgL,cbgR) cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.harvest_splice_sites() cbgIF.find_conserved_splice_sites() #################################################################### if verbose: print cbgL print cbgIF print cbgR cbgIF.interfaceproperties() #################################################################### # check the properties of the CBGinterface if cbgIF.optimalitycheck().count(True) >= 2: # yes; is_compatible and donor and/or acceptor is optimal cbgL._CBGinterface3p = cbgIF cbgR._CBGinterface5p = cbgIF cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, cbgR ] ################################################################ if verbose: print "INFRAME INTRON CONFIRMED!!" ################################################################ else: # no compatible interface... although intron(s) was/were found! # (at least) two options are now open: # 1. enforce the intron(s) and create cbgIF with _forced_ends # 2. ignore the intron(s) and create an intermediate lsrCBG # 1. is `tricky`. First, how sure is this inframe intron, # what type of criteria do we assume etc etc. # second, how to create a coorect cbgIF? It must be an # IS_SPLITTED interface, of which the boundaries might fall # outside the OMSR's of the CBGs. # 2. 
ignore the intron(s) and create an intermediate lsrCBG lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG) prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR) cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, lsrCBG, cbgR ] ################################################################ if verbose: print "no INFRAME INTRON -> lsrCBG" print cbgL print " ", lsrCBG._CBGinterface5p print " ", lsrCBG print " ", lsrCBG._CBGinterface3p print cbgR self.printmultiplealignment() print cbgL cbgL.printmultiplealignment() print cbgR cbgR.printmultiplealignment() ################################################################ # EOF this function. # return False if this CBG remained intact, list of splits when splitted if len(return_cbg_list) == 1: return False else: return return_cbg_list
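##########################################################################
# Usage sketch for cbg_cexpander_inframe_intron_search(); a minimal example,
# assuming `cbg` is a CodingBlockGraph with its _cexpander cache set and
# that the function is called in function style (its docstring documents
# `self` as a CodingBlockGraph parameter); adapt to a method call if it is
# bound to the CBG class. `gsg` and `pos` below are illustrative names only.
#
#   result = cbg_cexpander_inframe_intron_search(cbg, verbose=False)
#   if not result:
#       pass    # no inframe-intron evidence; cbg remains unchanged
#   else:
#       # result is [ cbgL, cbgR ] or [ cbgL, lsrCBG, cbgR ]; splice it into
#       # the genestructure at the position of the original cbg
#       gsg.codingblockgraphs.__setslice__(pos, pos+1, result)
##########################################################################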
def construct_final_tiny_cbg(self, max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH, max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH, take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS, take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS, take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS, maximal_current_stopcodongraph_average_weight=0.90, minimal_last_vs_new_identity_ratio=0.80, maximal_cexpander_cbg_tail_uniformity_aa_length=3, elegiable_donor_omsr_nt_offset=21, verbose=False): """ Make a tiny final CBG by ``shooting tiny exons into the deep`` """ # get current last CBG last = self.get_final_cbg() # check if final tail of this CBG is uniformaly alignable cxpdrOutput = cexpanderanalyses_omsr2orfend(last) IS_UNIFORMLY_ALIGNED = True for trf in cxpdrOutput._transferblocks: if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"): IS_UNIFORMLY_ALIGNED = False break ############################################################ if verbose: print "Cexpander uniformaly aligned:", print maximal_cexpander_cbg_tail_uniformity_aa_length, print "->", IS_UNIFORMLY_ALIGNED print "omsr: ", last._cexpander.projected_on, print last._cexpander.binarystring trf = cxpdrOutput.get_transfer_of_projected_on( last._cexpander.projected_on) if trf and trf != True: print "omsr2orfend:", last._cexpander.projected_on, print trf.binarystring ############################################################ if IS_UNIFORMLY_ALIGNED: # break out of this function. Chance of overpredicting # a final tiny exon is bigger then finding a True one! return False # check if the stopcodongraph is not (very) good already if last._stopcodongraph.average_weight() >=\ maximal_current_stopcodongraph_average_weight: # break out of this function. Chance of overpredicting # a final tiny exon is bigger then finding a True existing one return False # start the timer (performance benchmark in verbose mode) stw = StopWatch(name='stwFinalECG') stw.start() # get FinalExons on elegiable Orfs based on distance towards OMSR of # current last CBG and minimal acceptor site score omsr = last.overall_minimal_spanning_range() maxsr = last.maximal_spanning_range() ECG = ExonCollectionGraph() ################################################################ if verbose: print "currentLAST", last print last._stopcodongraph print last._stopcodongraph.is_optimal() for org in last.organism_set(): print org, last._stopcodongraph.is_optimal(organism=org) for organism in last.organism_set(): node = last.node_by_organism(organism) theorf = last.get_orfs_of_graph(organism=organism)[0] print organism, "\t", node, "\t", max(omsr[node]), "\t", print max(maxsr[node]), theorf.endPY/3 ################################################################ for organism in last.organism_set(): node = last.node_by_organism(organism) # calculate an offset for the acceptor position # variable elegiable_acceptor_omsr_nt_offset is needed to # enlarge the OMSR definded offset. When the OMSR is by chance # a few nt or aa larger than the actual exon length, the true # acceptor position can be erroneously abandoned. 
offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset theorf = last.get_orfs_of_graph(organism=organism)[0] # check if this final orf is self can serve as a final extension remaining_orf_nt_length = (theorf.protein_endPY - max(omsr[node])) * 3 remaining_maxsr_nt_length = (max(maxsr[node]) - max(omsr[node])) * 3 remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3 FIND_NEW_FINAL_ORFS = True STORE_CURRENT_ORF_AS_FIOO = False if remaining_maxsr_nt_length >= max_exon_nt_length: # exceptionally large maxsr on rigth side of omsr # store as FIOO but to NOT search for an orf extension! ### FIND_NEW_FINAL_ORFS = False # discarded 17/09/2009; when poos maxsr present, overruled! STORE_CURRENT_ORF_AS_FIOO = True elif remaining_maxsr_tostop_nt_length <= 18: # maxsr is less then 6 AA apart from stop on current orf #FIND_NEW_FINAL_ORFS = False STORE_CURRENT_ORF_AS_FIOO = True elif remaining_orf_nt_length < max_exon_nt_length: # final piece of unaligned sequence is a perfect HMM seed STORE_CURRENT_ORF_AS_FIOO = True else: pass if STORE_CURRENT_ORF_AS_FIOO: cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) ) # set pssm_score to (very) high; this rewards # using the current Orf as the last Orf cbs.pssm_score = 20.0 fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf) node = (organism,theorf.id,fioo.start,fioo.end) ECG.add_node_and_object(node,fioo) ################################################################ if verbose: print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY ################################################################ if not FIND_NEW_FINAL_ORFS: # quit here -> no orf extension of this CBG continue # get elegiable (new) final orfs orflist = self.input[organism]['orfs'].get_elegiable_orfs( max_orf_start=offset+max_intron_nt_length, min_orf_end=offset ) ################################################################ if verbose: print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3 ################################################################ for orf in orflist: results = find_tailing_exon_on_orf( theorf,orf, current_donor_pos=offset, max_tailingexon_nt_length=max_exon_nt_length, max_tailingexon_intron_nt_length=max_intron_nt_length, ) for exon,intron in results: node = (organism,orf.id,exon.start,exon.end) if node not in ECG.get_nodes(): ECG.add_node_and_object(node,exon) if verbose: print organism, node, exon if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count() # now take only the best `take_max_best_acceptors` # because there can be quite some of them! 
for organism in ECG.organism_set(): objects = ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True ) for obj in objects[take_max_best_acceptors:]: node = (organism,obj.orf.id,obj.start,obj.end) ECG.del_node(node) if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score ######################################################################## if verbose: print stw.lap(), ">take_max_best_acceptors DELETED" for organism in ECG.organism_set(): for obj in ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True ): print "remaining", organism, obj.orf.id, obj.length, obj ######################################################################## # only continue if all organisms are represented in the ECG if last.organism_set_size() > ECG.organism_set_size(): if verbose: print "To few organisms/genes present -> return False" return False # create edges in the ECG between compatible phases and # exon length, then make pacbps for these edges ECG.create_edges() ECG.make_pacbps_for_edges() if verbose: print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps) # search for complete graphs in this last_exon_graphs = ECG.find_fully_connected_subgraphs() ######################################################################## if verbose: print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()", print len(last_exon_graphs) ######################################################################## # only continue if there is an perfectly aligned last exon graph if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0): #################################################################### if verbose: print "no perfect aligned last exon graph -> return False" #################################################################### return False # convert to CodingBlockGraphs new_last_cbgs = [] for leg in last_exon_graphs[0:take_max_best_ecgs]: cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last) if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size(): # create cache of CBG and do final check on quality cbg.create_cache() if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\ cbg._cexpander.binarystring.find("1") == -1: # discard hardly alignable CBGs continue # if here, then append this cbg as a possible novel final CBG new_last_cbgs.append( cbg ) ################################################################ if verbose: print "LEGcbg", cbg ################################################################ ######################################################################## if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs) ######################################################################## if not new_last_cbgs: #################################################################### if verbose: print "no ecgs convertable to CBGs -> return False" #################################################################### return False # order by total weight, get the optimal CBG and its corresponding ECG new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs) theNewLastCbg = None cbgIF = None # check all interfaces between the novel final CBGs and the previous # CBG. The best interface is added to the GSG! 
cbgif_accepted_new_last_cbgs = [] already_checked_node_sets = [] for newcbg in new_last_cbgs[0:take_max_best_cbgs]: lastExonGraph = newcbg._ExonCollectionGraph del( newcbg._ExonCollectionGraph ) # check if it is not the extention of the current # last CBG (identical nodes) if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0: if verbose: print "newCBG is the extention of current last CBG!!" continue # check if this combination of nodes (orfs) has not been tried already if newcbg.get_ordered_nodes() in already_checked_node_sets: ############################################################### if verbose: print "newCBG node set done earlier:", print newcbg.get_ordered_nodes() ############################################################### continue else: # append this set of nodes (as a list) to checklist already_checked_node_sets.append( newcbg.get_ordered_nodes() ) # check if this new final tinyexon graph has a compatible interface # with the current last one cbgIF = CodingBlockGraphInterface(last,newcbg) cbgIF.harvest_splice_sites() distinct_orgs = [] for node in lastExonGraph.get_nodes(): exon = lastExonGraph.get_node_object(node) if exon.acceptor.__class__.__name__ == 'SpliceAcceptor': distinct_orgs.append( lastExonGraph.organism_by_node(node) ) cbgIF.allow_intron_in_organisms(distinct_orgs) cbgIF.find_conserved_splice_sites() # do NOT optimize -> consumes a lot of time and is helpfull # only in extreme cases... #cbgIF.optimize() if not cbgIF.is_compatible(): ################################################################ if verbose: print "newCBG not a is_compatible() cbgIF" print newcbg ################################################################ continue # append to cbgif_accepted_new_last_cbgs newcbg._CBGinterface5p = cbgIF cbgif_accepted_new_last_cbgs.append( ( cbgIF.optimalitycheck().count(True), newcbg.total_weight(), newcbg ) ) ######################################################################## if verbose: print stw.lap(), "cbgIFs checked %s/%s" % ( len(cbgif_accepted_new_last_cbgs), len(new_last_cbgs[0:take_max_best_cbgs]) ) ######################################################################## # now start by adding the highest scoring newcbg first cbgif_accepted_new_last_cbgs.sort() cbgif_accepted_new_last_cbgs.reverse() ######################################################################## if verbose: print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs) for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs: print true_cnt,totalwt,newcbg._CBGinterface5p print newcbg ######################################################################## for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs: # get the already created cbgIF from the newcbg graph cbgIF = newcbg._CBGinterface5p # now check 4 criteria: # (1) cbgIF.is_optimal() (2) >GTG.identity # (3) >STG.totalweight (4) <STG.distance criteria = [] criteria.append( cbgIF.is_optimal() ) criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() ) criteria.append( newcbg.genetree().identity() > last.genetree().identity() ) criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() ) #################################################################### if verbose: print "TRYING ADDITION of final newcbg", criteria print true_cnt,totalwt,newcbg._CBGinterface5p print newcbg #################################################################### # check if there is only a single different 
node/orf changed in the newcbg # this is recognized by a symmetric_difference of size 2 # in this case, be very strict! This easily causes overprediction (FP) tiny exons if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2: # check if 4 criteria are valid; # a single False results in not accepting this new last tiny cbg if False in criteria: if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria # continue -> no new tiny CBG continue # now start check the criteria. # if criteria[0] == True, means a fully is_optimal interface! # do not perform any additional check, just add! if criteria[0] == True: theNewLastCbg = newcbg break # total weight criterion -> new.tw() > last.tw() if criteria[1] == False: ########################################################################## if verbose: print "# NOVEL lastTinyExon discarded; to low total weight" print "#", newcbg._stopcodongraph ########################################################################## # continue -> no new tiny CBG continue # identity criterion -> allow a ratio i.s.o. new.id() > last.id() # this strict criterion (>) is applied for single-new-orf-CBGs if criteria[2] == False: ratio = newcbg.genetree().identity() / last.genetree().identity() if ratio < minimal_last_vs_new_identity_ratio: ###################################################################### if verbose: print "# NOVEL lastTinyExon discarded; to low identity" print "#", newcbg._stopcodongraph, newcbg.genetree().identity() ###################################################################### # continue -> no new tiny CBG continue if criteria[3] == False: ########################################################################## if verbose: print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance" print "#", newcbg._stopcodongraph ########################################################################## # continue -> no new tiny CBG continue # if this point is reached, a new tiny last CBG has been found! theNewLastCbg = newcbg # break out of the for loop; store into the genestructure break # all okay -> ready for inserting the new CBG if theNewLastCbg and verbose: ################################################################################ print "NEW FINAL TINY EXON FOUND!!" print theNewLastCbg print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable() print cbgIF._optimal_aligned_donor, cbgIF.donor_phase() print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase() ################################################################################ # hard-insert into the genestructure # using add_codingblock is likely to cause problems # because of the tinyness of the CBG if theNewLastCbg: for pos in range(0,len(self)): if self.codingblockgraphs[pos].IS_IGNORED: continue if self.codingblockgraphs[pos].IS_LAST: thelast = self.codingblockgraphs[pos] thelast.IS_LAST = False newcbg.IS_LAST = True self.codingblockgraphs.insert(pos+1,theNewLastCbg) # set the CBGInterface object in next and prev CBG self.codingblockgraphs[pos]._CBGinterface3p = cbgIF self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF # break out; end of this function break # done! return a True because newcbg is created & inserted return True else: # no newLastCbg found return False
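##########################################################################
# Usage sketch for construct_final_tiny_cbg(); a minimal example, assuming
# `gsg` is the genestructure object this method belongs to. All thresholds
# default to the SHORT_TAILINGEXON_* settings referenced in the signature.
#
#   if gsg.construct_final_tiny_cbg(verbose=True):
#       # a novel tiny final CBG was created, hard-inserted after the
#       # previous last CBG, and both cbgIF interfaces were set
#       newlast = gsg.get_final_cbg()
#       print "new final CBG:", newlast
#   else:
#       # nothing added: the last CBG tail was uniformly aligned, its
#       # stopcodongraph was already good enough, or no candidate survived
#       pass
##########################################################################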
def mine(self, identifier, verbose=None): """ """ # (re)set mined results to empty self._data = [] self._loci = [] # start timer stw = StopWatch("dbwarehouseMiner.mine('%s')" % identifier) if verbose: print stw.start() # find the current identifier in the warehouse identifier = identifier.replace("'", "").replace('"', '').strip() if not identifier: return False genomedir = self.identifier2genomedir(identifier) if not genomedir: return False # append the main/central locusdir to the loci locusdir = self.identifier2locusdir(identifier, genomedir=genomedir) if not locusdir: return False self._loci.append(locusdir) if verbose: print stw.lap(), "main locus identified" # now mine in the warehouse if self.SEARCH_METHOD != 'SIMILARITY': # set some column restraints as VERY strict (&&) i.s.o loose (||) column_restrain = "&&" else: column_restrain = "||" ####genomedirtag = os.path.basename(os.path.split(genomedir)[0]) genomedirtag = os.path.basename(genomedir) blastarchpatAB = os.path.join( self.dbwarehouse_path, "_crossblastp", "blast.%s_x_*.symmetrized" % (genomedirtag)) blastarchpatBA = os.path.join( self.dbwarehouse_path, "_crossblastp", "blast.*_x_%s.symmetrized" % (genomedirtag)) basecommand = """ awk -F':' '{ print $1"\\t"$2 }' | awk """ +\ """ '{ if (($5>=%1.3f %s $6>=%1.3f) && ($7>=%1.3f %s $8>=%1.3f) && """ % ( self.MINIMAL_OVERLAP_RATIO, column_restrain, self.MINIMAL_OVERLAP_RATIO, self.MINIMAL_BITSCORE_RATIO, column_restrain, self.MINIMAL_BITSCORE_RATIO, ) +\ """ (($5/$6)<=%1.2f %s ($6/$5)<=%1.2f)) { print $0"\t"(($5+$6)*$4)/2 } }' """ % ( self.MAXIMAL_LENGTH_RATIO, column_restrain, self.MAXIMAL_LENGTH_RATIO, ) +\ """ | sort -gr -k 9 """ # commands with grep and zgrep for *.symmetrized and *.symmetrized.gz files command_grep = """grep "%s" %s %s | sort -u | %s""" % ( identifier, blastarchpatAB, blastarchpatBA, basecommand) command_zgrep = """zgrep "%s" %s %s | sort -u | %s""" % ( identifier, blastarchpatAB + ".gz", blastarchpatBA + ".gz", basecommand) # run the grep command ci, co, ce = os.popen3(command_grep) ci.close() lines = co.readlines() co.close() ce.close() # run the zgrep command ci, co, ce = os.popen3(command_zgrep) ci.close() lines.extend(co.readlines()) co.close() ce.close() seentags = [] ignoretags = [] for line in lines: fname, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB, order = line.strip( ).split("\t") if fname.find(".symmetrized.gz") >= 0: # process the lines obtained with the zgrep command tagA, tagB = fname[0:fname.find(".symmetrized.gz" )][fname.find("/blast.") + 7:].split("_x_") else: # process the lines obtained with the (normal) grep command tagA, tagB = fname[0:fname.find(".symmetrized" )][fname.find("/blast.") + 7:].split("_x_") # ignore the line completely when a limitation on genomedirs is applied and valid if self.genometags_to_use: if not (tagA in self.genometags_to_use and tagB in self.genometags_to_use): continue if self.genometags_to_ignore: if tagA in self.genometags_to_ignore or tagB in self.genometags_to_ignore: continue # ignore this line when one of the tags are (in) ignoretags if tagA in ignoretags: continue if tagB in ignoretags: continue # swap tagA & tagB when the tag's are in reversed order # this is due to the dbwarehouse crossblastp files # blast.B_x_A.symmetrized.gz isa symbolic link to # blast.A_x_B.symmetrized.gz if B > A (in string order) ordered_tags = [tagA, tagB] ordered_tags.sort() if [tagA, tagB] != ordered_tags: # swap tagA & tagB tagA, tagB = tagB, tagA if self.SEARCH_METHOD == 'HOMOLOGS': if self.ALLOW_PARALOGS: pass 
else: if tagA == tagB: continue if tagA in seentags and tagB in seentags: continue elif self.SEARCH_METHOD == 'BDBH': if tagA == tagB and self.ALLOW_PARALOGS: if tagA in [tup[0] for tup in self._data]: continue # there is already a fine hit gathered else: pass else: if tagA == tagB and not self.ALLOW_PARALOGS: continue if tagA in seentags and tagB in seentags: continue elif self.SEARCH_METHOD == 'SAFEORTHOLOGS': if tagA == tagB: # check if there is not a paralog in the identifier's species it self # that is to close nearby this identifier (a hypothetical paralogue) ratioA, ratioB = float(ratioA), float(ratioB) if max([ratioA, ratioB]) > self.SAFEORTHOLOGS_RATIO: # there is in its own genome a hypothetical paralogue! # empty data and break out! self._data = [] break else: continue elif tagA in seentags and tagB in seentags: if idA == identifier: ratio = float(ratioA) thetag = tagB else: ratio = float(ratioB) thetag = tagA maxratio = self._getfromdata(self._data, thetag)[5] if min([ratio / maxratio, maxratio / ratio ]) > self.SAFEORTHOLOGS_RATIO: # remove this tag from data -> ortholog assignment is not 100% shure! self._removefromdata(self._data, thetag) ignoretags.append(thetag) continue else: continue else: pass else: # mode similarity -> all hits are okay pass # append tags to seentags if tagA not in seentags: seentags.append(tagA) if tagB not in seentags: seentags.append(tagB) # if here, a similar protein is mined! # gather locusdir and similarity data bitscore = int(float(bitscore)) overlapA = float(overlapA) overlapB = float(overlapB) ratioA = float(ratioA) ratioB = float(ratioB) #if idA == identifier: # self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB )) # ###print line.strip() # ###print "A", self._data[-1],"\n" #else: # self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA )) # ###print line.strip() # ###print "B", self._data[-1],"\n" if idA == identifier: self._data.append( (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB)) elif idB == identifier: self._data.append( (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA)) elif idA.find(identifier) == 0: self._data.append( (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB)) elif idB.find(identifier) == 0: self._data.append( (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA)) else: print "WHAT ELSE!?::", tagA, tagB, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB # remove the TEMPORARILY element in mode SAFEORTHOLOGS if self.SEARCH_METHOD == 'SAFEORTHOLOGS': self._removefromdata(self._data, genomedirtag) # order _data on bitscore tmpdata = [] for item in self._data: tmpdata.append((item[2], item)) tmpdata.sort() tmpdata.reverse() self._data = [item for (s, item) in tmpdata] print len(self._data), self.maximal_num_loci # remove _data elements when self.maximal_num_loci is exceeded if len(self._data) > self.maximal_num_loci - 1: if (self.verbose and verbose == None) or verbose: # print the removed loci to screen print "# removed loci (%s): --maximal_num_loci (%s) exceeded" % ( len(self._data) - self.maximal_num_loci + 1, self.maximal_num_loci) for tup in self._data[self.maximal_num_loci - 1:]: row = list(tup) row.insert(0, genomedirtag) row.insert(2, identifier) print "\t".join([str(elem) for elem in row]) # now actually remove the rows from _data # minus 1 is for the --identifier locus itself self._data = self._data[0:self.maximal_num_loci - 1] # get the loci belonging to the mined similar proteins for (tagB, idB, bitscore, overlapA, overlapB, ratioA, 
ratioB) in self._data: tagBgenomedir = os.path.join(self.dbwarehouse_path, tagB) locusdir = self.identifier2locusdir(idB, genomedir=tagBgenomedir) if not locusdir: print "HEROOO...." self._loci.append(locusdir) # add genomedirtag and identifier to _data rows for i in range(0, len(self._data)): row = list(self._data[i]) row.insert(0, genomedirtag) row.insert(2, identifier) self._data[i] = tuple(row) if (self.verbose and verbose == None) or verbose: # print the results! print "# main (1th) and mined loci" for locus in self._loci: print locus print "# similarity data" #for ( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ) in self._data: # print "\t".join([ str(elem) for elem in [genomedirtag, tagB, identifier, idB, bitscore, overlapA, overlapB, ratioA, ratioB ]]) for row in self._data: print "\t".join([str(elem) for elem in row]) print "# settings/options" print "seentags: ", seentags print "ignoretags:", ignoretags print "use: ", self.genometags_to_use print "ignore: ", self.genometags_to_ignore print "# timing/performace" print stw.lap() return self._loci, self._data
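##########################################################################
# Usage sketch for dbwarehouseMiner.mine(); a minimal example, assuming
# `miner` is a configured dbwarehouseMiner instance (dbwarehouse_path,
# SEARCH_METHOD, ratio thresholds etc. already set). The identifier string
# shown here is purely illustrative.
#
#   result = miner.mine("examplegene.locus001", verbose=True)
#   if not result:
#       print "identifier not found in the warehouse"
#   else:
#       loci, data = result
#       # loci : list of locus directories (the main/central locus first)
#       # data : per-hit tuples (genomedirtag, tag, identifier, id, bitscore,
#       #        overlapA, overlapB, ratioA, ratioB), ordered by bitscore
#       for row in data:
#           print "\t".join([ str(elem) for elem in row ])
##########################################################################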
def blastanalysescbgjunction( gsg, prevCBG, nextCBG, omit_cbg_orfs=False, omit_non_cbg_orfs=False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org, orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf for org, orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input, prevCBG, nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n")) ############################################################ if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org, orfid = header.split("_orf_") orfid = int(orfid) node = (org, orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del (fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del (fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) ############################################################ if verbose: print stw.lap(), "fasta db (2):", len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa" writeMultiFasta(fastadbmfa, fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ, orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ + "_orf_" + str(orfQ.id) # check if key exists in fastadbmfa. 
In a case where # an Orf is masked out completely, it is absent here! if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS, int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ################################################################ if verbose: print pacbporf, orgQ, orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) 
take the actual pacbps into account) dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print[p.bitscore for p in dpcpacbpcol.pacbps.values()] print "PCG nodes:", dpcpacbpcol.get_ordered_nodes() ################################################################ #### do some transformations on the pacbpcol ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1) ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs( #### edges=gsg.node_count()-1 , max_missing_edges=0 ) ##### convert to list of CBGs and do some transformations ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) ####cbgList.remove_all_but_complete_cbgs() ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) ####cbgList.remove_cbgs_without_omsr() ####cbgList.update_edge_weights_by_minimal_spanning_range() ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2]) pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity) max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3 splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.make_pacbps_for_missing_edges() cbgList.remove_all_but_complete_cbgs() cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight', reversed=True) # and create_cache() for these CBGs for cbg in cbgList: cbg.create_cache() #################################################################### if verbose: print stw.lap(), "CBGs created", len(cbgList) for newcbg in cbgList: print "new:", newcbg #################################################################### # return list with CBGs return cbgList.codingblockgraphs
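##########################################################################
# Usage sketch for blastanalysescbgjunction(); a minimal example, assuming
# `gsg` is the genestructure object and prevCBG / nextCBG are neighbouring
# CBGs in gsg.codingblockgraphs with a problematic interface between them.
#
#   newcbgs = blastanalysescbgjunction(gsg, prevCBG, nextCBG, verbose=True)
#   if not newcbgs:
#       pass    # nothing alignable found in the masked junction sequences
#   else:
#       # candidate CBGs are ordered by total_weight(); the caller decides
#       # which one (if any) to place in between prevCBG and nextCBG
#       for cbg in newcbgs:
#           print cbg
##########################################################################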
def cexpander2multiplealignment(cxpdr, verbose=False): """ This function and its application are still under development. In future version, this cexpander obtained data will replace the (deprecated) PAOC and PASC VISTA-like tracks which were far to computationally expensive to obtain. """ ######################################################################## if verbose: stw = StopWatch(name="cxpdr2multiplealignment") stw.start() ######################################################################## # for each of the _transferblocks (1 for each organism/gene), the # binarystring **should** contain an identical number of 1's # in freak-accident cases (1 in hundreds of thousand of cases), # it is observed that this not the case. Catch this exception here # before it hard-crashes with a raise somewhere later in this function if len(Set([trf.binarystring.count("1") for trf in cxpdr._transferblocks])) > 1: print "WARNING: unequal Cexpander.transferblocks.binarystring 1's count:", print Set( [trf.binarystring.count("1") for trf in cxpdr._transferblocks]) return False # split the cexpander binarystrings on character changes 0->1 and 1->0 substrings = {} orgs = [trf.header for trf in cxpdr._transferblocks] for ipos in range(0, len(orgs)): org = orgs[ipos] trf = cxpdr._transferblocks[ipos] substrings[org] = [ x.group() for x in re.finditer("(1+|0+)", trf.binarystring) ] # maximum number of blocks in the cexpander output # WARNING TODO THIS IS STILL NOT 100% SAFE!! try: maxblocks = max( Set([len(substrings[org]) for org in substrings.keys()])) except: print "ERROR in cexpander2multiplealignment" print substrings.keys() print "inputseqs:", len(cxpdr.sequences) for k, v in substrings.iteritems(): print k, len(v) print v # now raise the error... maxblocks = max( Set([len(substrings[org]) for org in substrings.keys()])) curblock = 0 ######################################################################## if verbose: print "maxblocks:", maxblocks, print[len(substrings[org]) for org in substrings.keys()] if len(Set([len(substrings[org]) for org in substrings.keys()])) > 1: for ipos in range(0, len(orgs)): org = orgs[ipos] print org, print[ Set(substrings[org][block]) for block in range(0, len(substrings[org])) ] trf = cxpdr._transferblocks[ipos] print trf.binarystring, len(trf.binarystring), print trf.binarystring.count("1"), trf.binarystring.count("0") ######################################################################## while curblock < maxblocks: try: # create curblocktypeset curblocktypeset = Set("".join( [substrings[org][curblock] for org in substrings.keys()])) except IndexError: # substrings[org][curblock](s) IndexError # can happen on EOF blocks if some have zeros, others have nothing # append empty block; this will be dealth with in the # curblocktypeset Set("0") for org in substrings.keys(): if len(substrings[org]) == curblock: substrings[org].append("") # recreate curblocktypeset in 2th instance curblocktypeset = Set("".join( [substrings[org][curblock] for org in substrings.keys()])) ######################################################################## if verbose: print "curiter::", curblock, maxblocks, print[len(substrings[org][curblock]) for org in substrings.keys()] ######################################################################## if curblocktypeset == Set("1"): # block of just ones; settle this block by limiting on minimal length # of all organisms of 111-string. 
            curblocklengths = Set(
                [len(substrings[org][curblock]) for org in substrings.keys()])
            if len(curblocklengths) == 1:
                pass  # all normal...
            else:
                minlength = min(curblocklengths)
                for org in substrings.keys():
                    if len(substrings[org][curblock]) > minlength:
                        blocklen = len(substrings[org][curblock])
                        substrings[org][curblock] = substrings[org][curblock][0:minlength]
                        substrings[org].insert(curblock + 1,
                                               "1" * (blocklen - minlength))
                        substrings[org].insert(curblock + 1, "")
                # increase maxblocks counter
                maxblocks = max(
                    Set([len(substrings[org]) for org in substrings.keys()]))
                ####################################################################
                if verbose:
                    print "TRBLOCKS CHANGED!, curblock, maxblocks:", curblock, maxblocks,
                    print [len(substrings[org]) for org in substrings.keys()]
                    for ipos in range(0, len(orgs)):
                        org = orgs[ipos]
                        print org,
                        print [
                            Set(substrings[org][block])
                            for block in range(0, len(substrings[org]))
                        ]
                ####################################################################

        elif curblocktypeset == Set("0"):
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))

        elif curblocktypeset == Set(["0", "1"]):
            # situation where frontal or intermediate zeros complicate
            # the multiple alignment
            for org in substrings.keys():
                if Set(substrings[org][curblock]) == Set(['1']):
                    substrings[org].insert(curblock, "")
            # next, proceed as if curblocktypeset == Set("0") (which it is now!)
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))

        else:
            print "MIXED!!", curblocktypeset, "curblock:", curblock, "maxblocks:", maxblocks
            print "ERROR WILL LIKELY OCCUR QUICKLY AFTER HERE..."
            pass
            import sys
            sys.exit()

        # increase the blocks counter
        curblock += 1

    ########################################################################
    if verbose:
        for org in substrings.keys():
            # print the sequence itself
            for block in range(0, maxblocks):
                offset = sum([
                    substrings[org][i].count("1") + substrings[org][i].count("0")
                    for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset + blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset + blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset + nongaps].lower() + "-" * gaps,
            print org
            for block in range(0, maxblocks):
                print substrings[org][block],
            print org
    ########################################################################
    if verbose:
        for block in range(0, maxblocks):
            if substrings[substrings.keys()[0]][block].count("1") > 0:
                continue
            for org in substrings.keys():
                offset = sum([
                    substrings[org][i].count("1") + substrings[org][i].count("0")
                    for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset + blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset + blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset + nongaps].lower() + "-" * gaps,
                print substrings[org][block],
                print org
    ########################################################################
    if verbose:
        for org in substrings.keys():
            print org, "\t",
            for block in range(0, maxblocks):
                print len(substrings[org][block]),
                if substrings[org][block].count("1") == 0:
                    print "(%s,%s)" % (substrings[org][block].count('0'),
                                       substrings[org][block].count('.')),
            print "\t\t", sum(
                [len(substrings[org][block]) for block in range(0, maxblocks)])
        print stw.lap()
    ########################################################################

    return substrings
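

# --------------------------------------------------------------------------
# Minimal, self-contained sketch (illustration only) of the block-splitting
# step used in cexpander2multiplealignment() above: a cexpander binarystring
# is cut into alternating runs of 1's and 0's with re.finditer, and each run
# becomes one block of the multiple alignment. The binarystring below is a
# hypothetical example; this helper is never called by the module.
def _demo_binarystring_to_blocks():
    import re
    binarystring = "1110001111011"
    blocks = [m.group() for m in re.finditer("(1+|0+)", binarystring)]
    print blocks   # ['111', '000', '1111', '0', '11']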


def blastanalysescbgjunction(gsg, prevCBG, nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    Perform (masked) blastp analyses on the Orfs around the junction of two neighbouring CBGs
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input, prevCBG, nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n")
    )

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################
    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa:
        return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose:
        print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():
        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ:
                continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header):
                continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname,
                extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0:
                continue
            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_")
                if _orgS != orgS:
                    continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose:
        print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol.
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
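

# --------------------------------------------------------------------------
# Tiny self-contained sketch (illustration only; identifiers are
# hypothetical) of the node / FASTA-header convention used in
# blastanalysescbgjunction() above: graph nodes are (organism, orf_id) tuples
# and entries in the masked fasta database are keyed "<organism>_orf_<id>".
# This helper is never called by the module.
def _demo_node_header_convention():
    org, orfid = "organismA", 17
    node = (org, orfid)
    header = org + "_orf_" + str(orfid)
    parsed_org, parsed_orfid = header.split("_orf_")
    assert (parsed_org, int(parsed_orfid)) == node
    print node, header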


def _recrute_pacbporfs_from_parental_cbg(self, parentcbg, create_cache=True,
        ignore_nonexisting_edges=False, verbose=False):
    """
    Harvest PacbPORFs from a (parental) CodingBlockGraph

    @attention: alternative for harvest_pacbps_from_crossdata()
    @attention: required in _place_cbg_in_partialgsg() function
    @attention: use create_cache=False with care!

    @type  parentcbg: CodingBlockGraph
    @param parentcbg: CodingBlockGraph that has to deliver PacbPORFs

    @type  create_cache: Boolean
    @param create_cache: run the create_cache() function on the CBG (self)

    @type  ignore_nonexisting_edges: Boolean
    @param ignore_nonexisting_edges: when False, do not create edges in the
                CBG (self) that are absent (but present in the parentcbg)

    @type  verbose: Boolean
    @param verbose: print debugging information to STDOUT when True
    """
    replacements = {}
    substituted = 0

    ####################################################################
    if verbose:
        stw = StopWatch("recruteParentalPacbps")
        print stw.start()
        print "target:", self
        print "source:", parentcbg
    ####################################################################

    for (node1, node2) in self.pairwisecrosscombinations_node():
        # if this edge is not present in the parent, ignore it
        if not parentcbg.has_edge(node1, node2):
            continue

        # get PacbPORF of the parent
        origpacbporf = parentcbg.get_pacbps_by_nodes(node1=node1, node2=node2)[0]
        curpacbporf = None
        replace_pacbporf = False

        if not self.has_edge(node1, node2):
            if ignore_nonexisting_edges:
                # if ignore_nonexisting_edges -> do not recruit this pacbp
                continue
            else:
                # replace this PacbPORF if it exists and
                # simultaneously create a novel edge
                replace_pacbporf = True
        elif self.has_edge(node1, node2) and not \
             self.get_pacbps_by_nodes(node1=node1, node2=node2):
            replace_pacbporf = True
        else:
            curpacbporf = self.get_pacbps_by_nodes(node1=node1, node2=node2)[0]
            if pacb.comparison.IsIdenticalPacbPORF(origpacbporf, curpacbporf):
                # PacbPORFs are already identical; not relevant to copy
                continue
            if origpacbporf.issuperset(curpacbporf):
                # store to replacements dict
                replacements[(node1, node2)] = curpacbporf
                # remove from the CBG -> replacement in progress
                self.remove_pacbp(curpacbporf, node1, node2)
                replace_pacbporf = True

        # check if replace_pacbporf is set to True
        if replace_pacbporf:
            ################################################################
            if verbose:
                print stw.lap(), "REPLACING PacbPORF Source->Target:"
                print "T:", curpacbporf, "(current)"
                print "S:", origpacbporf
                origpacbporf.print_protein(_linesize=100)
            ################################################################
            newkey = origpacbporf.construct_unique_key(node1, node2)
            self.set_edge_weight(node1, node2, wt=origpacbporf.bitscore)
            self.pacbps[(newkey, node1, node2)] = origpacbporf
            substituted += 1

    # check if substitutions have taken place
    if create_cache and substituted:
        #####################################################################
        if verbose:
            print stw.lap(), "CREATE_CACHE & substituted PacbPORFS:",
            print substituted, "edges:", len(self.weights) / 2,
            print "pacbps:", len(self.pacbps)
            ####for k,pacbporf in self.pacbps.iteritems():
            ####    print k,"\n",pacbporf
        #####################################################################
        self.clear_cache()
        # check if there is an OMSR upon recreation; in very
        # exceptional cases, OMSR can get lost in this step
        if self.has_overall_minimal_spanning_range():
            self.create_cache()
            self.update_edge_weights_by_minimal_spanning_range()
        else:
            #############################################################
            if verbose:
                print stw.lap(), "OMSR got lost!",
                print "replacements:", len(replacements)
                for (n1, n2), curpacbporf in replacements.iteritems():
                    print "REP:", curpacbporf, n1, n2
            #############################################################
            # OMSR got lost! Restore the replacements dict and as such
            # restore the original PacbPs one by one (in random order),
            # and quit as soon as an OMSR is restored.
            for (node1, node2), curpacbporf in replacements.iteritems():
                newkey = curpacbporf.construct_unique_key(node1, node2)
                tobereplpacbporf = self.get_pacbps_by_nodes(node1=node1,
                                                            node2=node2)[0]
                # remove from the CBG
                self.remove_pacbp(tobereplpacbporf, node1, node2)
                # and place back the original one
                self.set_edge_weight(node1, node2, wt=curpacbporf.bitscore)
                self.pacbps[(newkey, node1, node2)] = curpacbporf
                substituted -= 1
                if self.has_overall_minimal_spanning_range():
                    self.create_cache()
                    self.update_edge_weights_by_minimal_spanning_range()
                    #########################################################
                    if verbose:
                        print stw.lap(), "OMSR restored, substitutions:",
                        print substituted
                        print "T:", self
                    ##########################################################
                    # break out of the for loop of PacbP replacement
                    break

    # return number of replaced/added pacbporfs
    return substituted
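

# --------------------------------------------------------------------------
# Abstract, self-contained sketch (illustration only, not the module's API)
# of the substitute-then-rollback pattern used in
# _recrute_pacbporfs_from_parental_cbg(): apply substitutions, and when a
# required invariant (in the method above: the OMSR) gets lost, restore the
# originals one by one until the invariant holds again. Names below are
# hypothetical; this helper is never called by the module.
def _demo_substitute_with_rollback(current, substitutions, invariant_holds):
    replaced = {}
    for key, newval in substitutions.items():
        # remember the original value, then substitute
        replaced[key] = current[key]
        current[key] = newval
    if not invariant_holds(current):
        # invariant broken: restore originals until it holds again
        for key, oldval in replaced.items():
            current[key] = oldval
            if invariant_holds(current):
                break
    return current

# Example (doctest-like):
#   >>> _demo_substitute_with_rollback({"a": 1, "b": 2}, {"b": 99},
#   ...     lambda d: sum(d.values()) < 50)
#   {'a': 1, 'b': 2}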