Пример #1
0
    def get_pacbps_by_nodes(self, node1=None, node2=None, order_by='bits'):
        """
        Get the pacbp(s) from the CodingBlockGraph by node, optional two nodes

        @type  node1: *
        @param node1: node identifier; must be a node of this graph

        @type  node2: *
        @param node2: node identifier (or None); when given it must be a node
                      of this graph too

        @type  order_by: string
        @param order_by: 'bits' (DESC) or 'length' (DESC); any other value
                         silently falls back to the default 'bits'

        @rtype:  list
        @return: list of pacbps objects; [] when no pacbps are stored

        @raise InproperlyAppliedArgument: node1, or a given node2, is not
                                          present in the graph

        @attention: when only a single node is requested, pacbp(s) are swapped
                    to make the requested node the query node
        @attention: when two nodes are requested, the pacbp(s) are returned as
                    stored; no swapping is performed
        """
        # look the nodes up once; used for both checks and both messages
        nodes = self.get_nodes()
        if node1 not in nodes:
            message = "node1 `%s` not in graph: %s" % (node1, nodes)
            raise InproperlyAppliedArgument(message)
        if node2 and node2 not in nodes:
            message = "node2 `%s` not in graph: %s" % (node2, nodes)
            raise InproperlyAppliedArgument(message)
        if order_by not in ('bits', 'length'):
            order_by = 'bits'

        # if no pacbps are stored into the object yet, return []
        if not self.pacbps:
            return []

        thepacbps = []
        for (key, nodeA, nodeB), pacbporf in self.pacbps.items():
            if not (nodeA == node1 or nodeB == node1):
                # this pacbp does not involve the requested node
                continue
            if not node2:
                if nodeA == node1:
                    thepacbps.append(pacbporf)
                else:
                    # swap query and sbjct so node1 becomes the query
                    thepacbps.append(pacb.swap_query_and_sbjct(pacbporf))
            elif nodeA == node2 or nodeB == node2:
                thepacbps.append(pacbporf)

        # order the requested pacbps (descending) on the validated attribute
        return ordering.order_list_by_attribute(thepacbps,
                                                order_by,
                                                reversed=True)
Пример #2
0
    def get_pacbps_by_organism(self, organism, order_by=None):
        """
        Get the pacbp(s) from the CodingBlockGraph of a single organism

        @type  organism: * (string)
        @param organism: Organism identifier

        @type  order_by: string
        @param order_by: 'length' (DESC), 'bits' (DESC), 'node' or None
                         (on node); default None. Any unrecognized value is
                         treated as None.

        @rtype:  list
        @return: list of pacbps objects; [] when no pacbps are stored

        @raise OrganismNotPresentInGraph: organism is not in this graph

        @attention: pacbps are swapped such that `organism` is always the query!
        @attention: pacbps are ordered by their sbjct nodes; pacbps that share
                    a sbjct node keep the iteration order of self.pacbps
        """
        # check if requested organism is present in this graph
        if organism not in self.organism_set():
            raise OrganismNotPresentInGraph(organism)

        # if no pacbps are stored into the object yet, return []
        if not self.pacbps:
            return []

        # reset order_by if falsely assigned
        if order_by not in (None, 'bits', 'length'):
            order_by = None

        # collect (sbjct node, pacbp) pairs with `organism` as the query
        node_and_pacbps = []
        for (key, (org1, orf1), (org2, orf2)), pacbporf in self.pacbps.items():
            if organism == org1:
                node_and_pacbps.append(((org2, orf2), pacbporf))
            elif organism == org2:
                # swap query and sbjct!
                swapped = pacb.swap_query_and_sbjct(pacbporf)
                node_and_pacbps.append(((org1, orf1), swapped))

        # sort on the sbjct node ONLY; sorting the raw tuples would fall back
        # to comparing the pacbp objects themselves whenever two nodes tie
        node_and_pacbps.sort(key=lambda item: item[0])
        thepacbps = [pacbporf for (node, pacbporf) in node_and_pacbps]

        # re-order (descending) on an attribute if requested for
        if order_by is not None:
            thepacbps = ordering.order_list_by_attribute(thepacbps,
                                                         order_by,
                                                         reversed=True)

        # return the requested pacbps
        return thepacbps
Пример #3
0
    def get_pacbps_by_organisms(self, orgA, orgB, order_by='bits'):
        """
        Get the pacbp(s) from the CodingBlockGraph between two organisms

        @type  orgA: * (string)
        @param orgA: Organism identifier

        @type  orgB: * (string)
        @param orgB: Organism identifier

        @type  order_by: string
        @param order_by: 'bits' (DESC, default); any other value orders on
                         'length' (DESC)

        @rtype:  list
        @return: list of pacbps objects; [] when no pacbps are stored

        @raise OrganismNotPresentInGraph: orgA or orgB is not in this graph

        @attention: pacbps are returned as stored; they are NOT swapped, so
                    orgA can be either the query or the sbjct
        """
        # check if requested organisms are present in this graph
        organisms = self.organism_set()
        if orgA not in organisms:
            raise OrganismNotPresentInGraph(orgA)
        if orgB not in organisms:
            raise OrganismNotPresentInGraph(orgB)

        # if no pacbps are stored into the object yet, return empty list
        if not self.pacbps:
            return []

        # keep the pacbps that link orgA and orgB, in either orientation
        thepacbps = []
        for (key, (org1, orf1), (org2, orf2)), pacbporf in self.pacbps.items():
            if (orgA == org1 and orgB == org2) or \
            (orgB == org1 and orgA == org2):
                thepacbps.append(pacbporf)

        # order the requested pacbps; everything but 'bits' means 'length'
        if order_by == 'bits':
            attribute = "bits"
        else:
            attribute = "length"
        return ordering.order_list_by_attribute(thepacbps,
                                                attribute,
                                                reversed=True)
Пример #4
0
    def get_optimal_single_site(self,organism,bestalignedsite=None):
        """
        Get the optimal TranslationalStartSite from the collection

        @type  organism: *
        @param organism: Organism identifier

        @type  bestalignedsite: AlignedTranslationalStartSiteGraph
        @param bestalignedsite: AlignedTranslationalStartSiteGraph instance or
                                None; accepted but currently without any
                                effect on the outcome

        @rtype:  TranslationalStartSite
        @return: first site of this organism after ordering on 'pos', or None
                 when the organism has no sites
        """
        # order the sites of this organism on position; the first one is
        # taken as the most frontal (5p) TSS. A phase check is not needed
        # (all phases==0)
        organism_sites = self.get_organism_objects(organism)
        sites = ordering.order_list_by_attribute( organism_sites, order_by='pos')

        # TODO: now just the most frontal site is chosen; not the site most
        # nearby the OMSR, or the most frontal site > a certain score. This
        # might be implemented after studying ample examples.
        # Until then the outcome is identical with or without a
        # bestalignedsite.
        if not sites:
            return None
        return sites[0]
Пример #5
0
    def get_organism_objects(self,organism,order_by=''):
        """
        Collect the OBJECTS belonging to all nodes of one organism

        @type  organism: *
        @param organism: organism identifier (presumably a string)

        @type  order_by: string
        @param order_by: passed on unchanged as `order_by` to
                         ordering.order_list_by_attribute(); default ''

        @rtype:  list
        @return: the objects of this organism in this graph, as returned by
                 ordering.order_list_by_attribute()
        """
        # translate each node of the organism into its stored object
        organism_objects = []
        for node in self.get_organism_nodes(organism):
            organism_objects.append( self._node_object[node] )
        # leave the actual ordering to the shared helper
        return ordering.order_list_by_attribute( organism_objects, order_by=order_by )
Пример #6
0
    def get_organism_objects(self,organism,order_by=''):
        """
        Collect the OBJECTS belonging to all nodes of one organism

        @type  organism: *
        @param organism: organism identifier (presumably a string)

        @type  order_by: string
        @param order_by: passed on unchanged as `order_by` to
                         ordering.order_list_by_attribute(); default ''

        @rtype:  list
        @return: the objects of this organism in this graph, as returned by
                 ordering.order_list_by_attribute()

        @attention: not recommended to use in this class, added for
                    compatibility with other classes
        """
        # translate each node of the organism into its stored object
        organism_objects = []
        for node in self.get_organism_nodes(organism):
            organism_objects.append( self._node_object[node] )
        # leave the actual ordering to the shared helper
        return ordering.order_list_by_attribute( organism_objects, order_by=order_by )
Пример #7
0
    def togff(self,organism=None,gff=None):
        """
        Create gff tuple for ALL objects in the collection of a specific organism

        @type  gff: dictionary
        @param gff: overwrite default gff data, keys: ('fstrand','fphase','fref',etc...);
                    None (default) means a fresh empty dictionary per call

        @type  organism: * (presumably string)
        @param organism: Organism identifier to make the gff for

        @rtype:  list of tuples
        @return: list with the togff() result of each object of this organism,
                 ordered on 'pos'; [] when the organism is not in this graph
        """
        # a literal {} default would be one dictionary shared by every call
        if gff is None:
            gff = {}
        if organism not in self.organism_set():
            return []
        ordered_objects = ordering.order_list_by_attribute(
                self.get_organism_objects(organism), order_by='pos')
        return [ obj.togff( gff=gff ) for obj in ordered_objects ]
Пример #8
0
    def construct_final_tiny_cbg(self,
        max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH,
        max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH,
        take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS,
        take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS,
        take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS,
        maximal_current_stopcodongraph_average_weight=0.90,
        minimal_last_vs_new_identity_ratio=0.80,
        maximal_cexpander_cbg_tail_uniformity_aa_length=3,
        elegiable_donor_omsr_nt_offset=21,
        verbose=False):
        """
        Make a tiny final CBG by ``shooting tiny exons into the deep``

        Outline of the procedure:
          1. bail out (False) when the tail of the current final CBG is
             uniformly aligned according to cexpander, or when its
             stopcodongraph average weight is already high enough
          2. gather candidate final exons per organism into an
             ExonCollectionGraph (ECG) and keep the best scoring ones
          3. create edges and pacbps in the ECG, take its fully connected
             subgraphs and convert these into CodingBlockGraphs (CBGs)
          4. build a CodingBlockGraphInterface between the current final CBG
             and each candidate, and test the compatible candidates against
             four criteria
          5. insert the first accepted candidate directly behind the CBG that
             is currently flagged IS_LAST

        @type  max_exon_nt_length: presumably integer (nt)
        @param max_exon_nt_length: threshold for the remaining maxsr / orf
                length checks on the current final Orf; also passed to
                find_tailing_exon_on_orf() as max_tailingexon_nt_length

        @type  max_intron_nt_length: presumably integer (nt)
        @param max_intron_nt_length: added to the donor offset to obtain the
                maximal start of elegiable orfs; also passed to
                find_tailing_exon_on_orf() as max_tailingexon_intron_nt_length

        @type  take_max_best_acceptors: integer (used as slice boundary)
        @param take_max_best_acceptors: per organism, only this many ECG
                objects with the highest pssm_score are kept

        @type  take_max_best_ecgs: integer (used as slice boundary)
        @param take_max_best_ecgs: number of fully connected ECG subgraphs
                that is converted into CBGs

        @type  take_max_best_cbgs: integer (used as slice boundary)
        @param take_max_best_cbgs: number of candidate CBGs for which the
                interface with the current final CBG is checked

        @type  maximal_current_stopcodongraph_average_weight: float
        @param maximal_current_stopcodongraph_average_weight: when the average
                weight of the current stopcodongraph is >= this value, no new
                final CBG is searched for

        @type  minimal_last_vs_new_identity_ratio: float
        @param minimal_last_vs_new_identity_ratio: minimal ratio of
                new/last genetree identity tolerated when the new identity is
                not higher than the last one

        @type  maximal_cexpander_cbg_tail_uniformity_aa_length: integer
        @param maximal_cexpander_cbg_tail_uniformity_aa_length: number of
                trailing characters of each cexpander binarystring that is
                inspected for a "0"

        @type  elegiable_donor_omsr_nt_offset: integer (nt)
        @param elegiable_donor_omsr_nt_offset: subtracted from the (nt) end
                of the OMSR to obtain the donor offset

        @type  verbose: Boolean
        @param verbose: print intermediate results and lap times

        @rtype:  Boolean
        @return: True when a novel final CBG was selected, False otherwise
        """
        # get current last CBG
        last = self.get_final_cbg()

        # check if final tail of this CBG is uniformly alignable:
        # it is NOT as soon as one transferblock has a "0" in the last
        # `maximal_cexpander_cbg_tail_uniformity_aa_length` positions
        cxpdrOutput = cexpanderanalyses_omsr2orfend(last)
        IS_UNIFORMLY_ALIGNED = True
        for trf in cxpdrOutput._transferblocks:
            if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"):
                IS_UNIFORMLY_ALIGNED = False
                break

        ############################################################
        if verbose:
            print "Cexpander uniformaly aligned:",
            print maximal_cexpander_cbg_tail_uniformity_aa_length,
            print "->", IS_UNIFORMLY_ALIGNED
            print "omsr:       ", last._cexpander.projected_on,
            print last._cexpander.binarystring
            trf = cxpdrOutput.get_transfer_of_projected_on(
                    last._cexpander.projected_on)
            if trf and trf != True:
                print "omsr2orfend:", last._cexpander.projected_on,
                print trf.binarystring
        ############################################################

        if IS_UNIFORMLY_ALIGNED:
            # break out of this function. Chance of overpredicting
            # a final tiny exon is bigger than finding a True one!
            return False

        # check if the stopcodongraph is not (very) good already
        if last._stopcodongraph.average_weight() >=\
        maximal_current_stopcodongraph_average_weight:
            # break out of this function. Chance of overpredicting
            # a final tiny exon is bigger than finding a True existing one
            return False

        # start the timer (performance benchmark in verbose mode)
        stw = StopWatch(name='stwFinalECG')
        stw.start()

        # get FinalExons on elegiable Orfs based on distance towards OMSR of
        # current last CBG and minimal acceptor site score
        omsr  = last.overall_minimal_spanning_range()
        maxsr = last.maximal_spanning_range()
        ECG = ExonCollectionGraph()

        ################################################################
        if verbose:
            print "currentLAST", last
            print last._stopcodongraph
            print last._stopcodongraph.is_optimal()
            for org in last.organism_set():
                print org, last._stopcodongraph.is_optimal(organism=org)
            for organism in last.organism_set():
                node = last.node_by_organism(organism)
                theorf = last.get_orfs_of_graph(organism=organism)[0]
                print organism, "\t", node, "\t", max(omsr[node]), "\t",
                print max(maxsr[node]), theorf.endPY/3
        ################################################################

        for organism in last.organism_set():
            node = last.node_by_organism(organism)
            # calculate an offset for the acceptor position
            # variable elegiable_donor_omsr_nt_offset is needed to
            # enlarge the OMSR defined offset. When the OMSR is by chance
            # a few nt or aa larger than the actual exon length, the true
            # acceptor position can be erroneously abandoned.
            offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset 
            theorf = last.get_orfs_of_graph(organism=organism)[0]

            # check if this final orf itself can serve as a final extension
            # (all three lengths are AA differences multiplied by 3 -> nt)
            remaining_orf_nt_length          = (theorf.protein_endPY - max(omsr[node])) * 3
            remaining_maxsr_nt_length        = (max(maxsr[node]) - max(omsr[node])) * 3
            remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3 


            # NOTE(review): FIND_NEW_FINAL_ORFS is never set to False in the
            # live code below (both assignments are commented out), so the
            # `continue` further down is currently never taken.
            FIND_NEW_FINAL_ORFS       = True
            STORE_CURRENT_ORF_AS_FIOO = False 
            if remaining_maxsr_nt_length >= max_exon_nt_length:
                # exceptionally large maxsr on right side of omsr
                # store as FIOO but to NOT search for an orf extension!
                ### FIND_NEW_FINAL_ORFS       = False # discarded 17/09/2009; when poos maxsr present, overruled!
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_maxsr_tostop_nt_length <= 18:
                # maxsr is at most 6 AA (18 nt) apart from stop on current orf
                #FIND_NEW_FINAL_ORFS       = False
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_orf_nt_length < max_exon_nt_length:
                # final piece of unaligned sequence is a perfect HMM seed
                STORE_CURRENT_ORF_AS_FIOO = True
            else:
                pass

            if STORE_CURRENT_ORF_AS_FIOO:
                cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) )
                # set pssm_score to (very) high; this rewards
                # using the current Orf as the last Orf
                cbs.pssm_score = 20.0
                fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf)
                # from here on `node` is an ECG node, no longer the CBG node
                node = (organism,theorf.id,fioo.start,fioo.end)
                ECG.add_node_and_object(node,fioo)
                ################################################################
                if verbose:
                    print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length
                    print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY
                ################################################################

            if not FIND_NEW_FINAL_ORFS:
                # quit here -> no orf extension of this CBG
                continue

            # get elegiable (new) final orfs
            orflist = self.input[organism]['orfs'].get_elegiable_orfs(
                    max_orf_start=offset+max_intron_nt_length,
                    min_orf_end=offset )
            ################################################################
            if verbose:
                print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3
            ################################################################
            for orf in orflist:
                results = find_tailing_exon_on_orf(
                        theorf,orf,
                        current_donor_pos=offset,
                        max_tailingexon_nt_length=max_exon_nt_length,
                        max_tailingexon_intron_nt_length=max_intron_nt_length,
                        )
                for exon,intron in results:
                    node = (organism,orf.id,exon.start,exon.end)
                    # add each distinct exon node only once
                    if node not in ECG.get_nodes():
                        ECG.add_node_and_object(node,exon)
                        if verbose: print organism, node, exon

        if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count()

        # now take only the best `take_max_best_acceptors`
        # because there can be quite some of them!
        for organism in ECG.organism_set():
            objects = ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True )
            for obj in objects[take_max_best_acceptors:]:
                node = (organism,obj.orf.id,obj.start,obj.end)
                ECG.del_node(node)
                if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score

        ########################################################################
        if verbose:
            print stw.lap(), ">take_max_best_acceptors DELETED"
            for organism in ECG.organism_set():
                for obj in ordering.order_list_by_attribute(
                    ECG.get_organism_objects(organism),
                    order_by='pssm_score', reversed=True
                    ):
                    print "remaining", organism, obj.orf.id, obj.length, obj
        ########################################################################

        # only continue if all organisms are represented in the ECG
        if last.organism_set_size() > ECG.organism_set_size():
            if verbose: print "To few organisms/genes present -> return False"
            return False

        # create edges in the ECG between compatible phases and
        # exon length, then make pacbps for these edges
        ECG.create_edges()
        ECG.make_pacbps_for_edges()
        if verbose:
            print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps)

        # search for complete graphs in this
        last_exon_graphs = ECG.find_fully_connected_subgraphs()

        ########################################################################
        if verbose: 
            print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()",
            print len(last_exon_graphs)
        ########################################################################

        # only continue if there is a perfectly aligned last exon graph;
        # only the FIRST graph in the list is inspected for this
        if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0):
            ####################################################################
            if verbose: print "no perfect aligned last exon graph -> return False"
            ####################################################################
            return False

        # convert to CodingBlockGraphs
        new_last_cbgs = []
        for leg in last_exon_graphs[0:take_max_best_ecgs]:
            cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last)
            if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size():
                # create cache of CBG and do final check on quality
                cbg.create_cache()
                if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\
                cbg._cexpander.binarystring.find("1") == -1:
                    # discard hardly alignable CBGs
                    continue
                # if here, then append this cbg as a possible novel final CBG
                new_last_cbgs.append( cbg )
                ################################################################
                if verbose: print "LEGcbg", cbg
                ################################################################

        ########################################################################
        if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs)
        ########################################################################

        if not new_last_cbgs:
            ####################################################################
            if verbose: print "no ecgs convertable to CBGs -> return False"
            ####################################################################
            return False

        # order by total weight, get the optimal CBG and its corresponding ECG
        new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs)
        theNewLastCbg = None
        cbgIF = None


        # check all interfaces between the novel final CBGs and the previous
        # CBG. The best interface is added to the GSG!
        cbgif_accepted_new_last_cbgs = []
        already_checked_node_sets = []

        for newcbg in new_last_cbgs[0:take_max_best_cbgs]:
            # take the ECG out of the CBG; it is only needed in this loop
            lastExonGraph = newcbg._ExonCollectionGraph
            del( newcbg._ExonCollectionGraph )

            # check if it is not the extension of the current
            # last CBG (identical nodes)
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0:
                if verbose: print "newCBG is the extention of current last CBG!!"
                continue

            # check if this combination of nodes (orfs) has not been tried already
            if newcbg.get_ordered_nodes() in already_checked_node_sets:
                ###############################################################
                if verbose: 
                    print "newCBG node set done earlier:", 
                    print newcbg.get_ordered_nodes()
                ###############################################################
                continue
            else:
                # append this set of nodes (as a list) to checklist
                already_checked_node_sets.append( newcbg.get_ordered_nodes() )

            # check if this new final tinyexon graph has a compatible interface
            # with the current last one
            cbgIF = CodingBlockGraphInterface(last,newcbg)
            cbgIF.harvest_splice_sites()
            # an intron is allowed only in those organisms whose exon has a
            # SpliceAcceptor as its acceptor
            distinct_orgs = []
            for node in lastExonGraph.get_nodes():
                exon = lastExonGraph.get_node_object(node)
                if exon.acceptor.__class__.__name__ == 'SpliceAcceptor':
                    distinct_orgs.append( lastExonGraph.organism_by_node(node) )
            cbgIF.allow_intron_in_organisms(distinct_orgs)
            cbgIF.find_conserved_splice_sites()
            # do NOT optimize -> consumes a lot of time and is helpful
            # only in extreme cases...
            #cbgIF.optimize()

            if not cbgIF.is_compatible():
                ################################################################
                if verbose:
                    print "newCBG not a is_compatible() cbgIF"
                    print newcbg
                ################################################################
                continue

            # append to cbgif_accepted_new_last_cbgs as a sortable tuple:
            # (number of True's in optimalitycheck, total weight, the CBG)
            newcbg._CBGinterface5p = cbgIF
            cbgif_accepted_new_last_cbgs.append(
                    (
                        cbgIF.optimalitycheck().count(True),
                        newcbg.total_weight(),
                        newcbg
                    )
                )

        ########################################################################
        if verbose:
            print stw.lap(), "cbgIFs checked %s/%s" % (
                len(cbgif_accepted_new_last_cbgs),
                len(new_last_cbgs[0:take_max_best_cbgs])
                )
        ########################################################################
        # now start by adding the highest scoring newcbg first
        # NOTE(review): plain tuple sort; when the first two elements tie,
        # the CBG objects themselves get compared.
        cbgif_accepted_new_last_cbgs.sort()
        cbgif_accepted_new_last_cbgs.reverse()

        ########################################################################
        if verbose:
            print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs)
            for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
        ########################################################################

        for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
            # get the already created cbgIF from the newcbg graph
            cbgIF = newcbg._CBGinterface5p

            # now check 4 criteria, stored in this order:
            # criteria[0] cbgIF.is_optimal()
            # criteria[1] new stopcodongraph total_weight > last
            # criteria[2] new genetree identity > last
            # criteria[3] new stopcodon2omsrdistance <= last
            criteria = []
            criteria.append( cbgIF.is_optimal() )
            criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() )
            criteria.append( newcbg.genetree().identity() > last.genetree().identity() )
            criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() )

            ####################################################################
            if verbose:
                print "TRYING ADDITION of final newcbg", criteria
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
            ####################################################################

            # check if there is only a single different node/orf changed in the newcbg
            # this is recognized by a symmetric_difference of size 2
            # in this case, be very strict! This easily causes overprediction (FP) tiny exons
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2:
                # check if 4 criteria are valid;
                # a single False results in not accepting this new last tiny cbg
                if False in criteria:
                    if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria
                    # continue -> no new tiny CBG
                    continue

            # now start check the criteria.
            # if criteria[0] == True, means a fully is_optimal interface!
            # do not perform any additional check, just add!
            if criteria[0] == True:
                theNewLastCbg = newcbg
                break

            # total weight criterion -> new.tw() > last.tw()
            if criteria[1] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; to low total weight"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue

            # identity criterion -> allow a ratio i.s.o. new.id() > last.id()
            # this strict criterion (>) is applied for single-new-orf-CBGs
            if criteria[2] == False:
                # NOTE(review): divides by last.genetree().identity(); a zero
                # identity of the last CBG would raise here - confirm it
                # cannot occur
                ratio = newcbg.genetree().identity() / last.genetree().identity()
                if ratio < minimal_last_vs_new_identity_ratio:
                    ######################################################################
                    if verbose:
                        print "# NOVEL lastTinyExon discarded; to low identity"
                        print "#", newcbg._stopcodongraph, newcbg.genetree().identity()
                    ######################################################################
                    # continue -> no new tiny CBG
                    continue

            # distance criterion -> new distance must not exceed the last one
            if criteria[3] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue

            # if this point is reached, a new tiny last CBG has been found!
            theNewLastCbg = newcbg
            # break out of the for loop; store into the genestructure
            break



        # all okay -> ready for inserting the new CBG
        # (after a `break` above, `newcbg` and `cbgIF` still refer to the
        # accepted candidate and its interface)
        if theNewLastCbg and verbose:
            ################################################################################
            print "NEW FINAL TINY EXON FOUND!!"
            print theNewLastCbg
            print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable()
            print cbgIF._optimal_aligned_donor, cbgIF.donor_phase()
            print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase()
            ################################################################################

        # hard-insert into the genestructure
        # using add_codingblock is likely to cause problems
        # because of the tinyness of the CBG
        if theNewLastCbg:
            for pos in range(0,len(self)):
                if self.codingblockgraphs[pos].IS_IGNORED: continue
                if self.codingblockgraphs[pos].IS_LAST:
                    thelast = self.codingblockgraphs[pos]
                    thelast.IS_LAST = False
                    # `newcbg` is the loop variable left over from the
                    # acceptance loop; it is the same object as theNewLastCbg
                    newcbg.IS_LAST  = True
                    self.codingblockgraphs.insert(pos+1,theNewLastCbg)
                    # set the CBGInterface object in next and prev CBG
                    self.codingblockgraphs[pos]._CBGinterface3p = cbgIF
                    self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF
                    # break out; end of this function
                    break

            # done! return a True because newcbg is created & inserted
            # NOTE(review): True is returned as well when the loop above
            # found no non-ignored CBG flagged IS_LAST, i.e. without any
            # insertion - confirm this cannot happen
            return True
        else:
            # no newLastCbg found
            return False