def harvest_elegiable_acceptor_sites(self,
        projected_acceptors=None,
        forced_codingblock_ends=None,
        prev=None,
        store_all_projected_sites=False,
        allow_phase_shift=False,
        enlarge_5p_boundary_by=None,    # in AA coordinates
        enlarge_3p_boundary_by=None,    # in AA coordinates
        ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE=None,
        MIN_ACCEPTOR_PSSM_SCORE=None,
        ALLOW_NON_CANONICAL_ACCEPTOR=None,
        NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE=None):
    """
    Harvest elegiable acceptor sites from this CodingBlockGraph into a
    AcceptorSiteCollectionGraph

    @type  projected_acceptors: dict (or None, treated as empty)
    @param projected_acceptors: organism -> iterable of projected site
                objects (each with a `pos` attribute); these are added
                before, and take precedence over, the scanned sites

    @type  forced_codingblock_ends: dict (or None, treated as empty)
    @param forced_codingblock_ends: organism -> object with a `pos`
                attribute that is added as the only node of that organism;
                no splice site scanning is done for such an organism

    @type  prev: CodingBlockGraph, LowSimilarityRegionCodingBlockGraph or None
    @param prev: when given and it contains the organism, it is passed as
                `prevcbg` to minimal_eligable_acceptor_site_position(); when
                it is a LowSimilarityRegionCodingBlockGraph, no range is
                set and no sites are scanned at all

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: if True, projected sites are stored
                even when they fall outside the considered range

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: if True, sites of different phase may be
                aligned, subject to MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE
                and MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE

    @type  enlarge_5p_boundary_by: integer or None
    @param enlarge_5p_boundary_by: AA positions to extend the range 5p-wards

    @type  enlarge_3p_boundary_by: integer or None
    @param enlarge_3p_boundary_by: AA positions to extend the range 3p-wards

    @param ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE: maximal distance (in
                triplets) between two sites for an edge to be added
    @param MIN_ACCEPTOR_PSSM_SCORE: passed to the splice site scanner and
                stored on the returned graph as MIN_PSSM_SCORE
    @param ALLOW_NON_CANONICAL_ACCEPTOR: passed to the splice site scanner
    @param NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE: passed to the splice
                site scanner

    @rtype:  AcceptorSiteCollectionGraph
    @return: graph with the elegiable sites as nodes and aligned sites
             connected by weighted edges

    @raise InproperlyAppliedArgument: `prev` is not a (lsr)CodingBlockGraph
    @raise TypeError: a site object in the collection is of an unknown class
    """
    # never share mutable default arguments between calls
    if projected_acceptors is None:
        projected_acceptors = {}
    if forced_codingblock_ends is None:
        forced_codingblock_ends = {}

    prev_class = prev.__class__.__name__
    if prev and prev_class not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "prev must be a CodingBlock graph object, not a %s" % prev_class
        raise InproperlyAppliedArgument(message)
    prev_is_lsrcbg = prev_class == "LowSimilarityRegionCodingBlockGraph"

    def _binary_entropy(site, pacbporf, on_query):
        """ Binary entropy of a single site of the query or the sbjct """
        siteclass = site.__class__.__name__
        if siteclass == 'SpliceAcceptor':
            # The site position is a DNA coordinate on its own sequence, so
            # it must be mapped with the accessor of that same sequence.
            # The original code used dnaposition_query() for the sbjct too.
            # NOTE(review): confirm that dnaposition_sbjct() mirrors the
            # signature and return value of dnaposition_query().
            if on_query:
                dna2alignmentpos = pacbporf.dnaposition_query
            else:
                dna2alignmentpos = pacbporf.dnaposition_sbjct
            alignmentpos, _phase = dna2alignmentpos(site.pos,forced_return=True)
            return pacbporf.alignment_entropy(alignmentpos,method='acceptor')
        elif siteclass == 'ProjectedSpliceAcceptor':
            return site.entropy
        elif siteclass == 'CodingBlockStart':
            return 1.0
        else:
            # was a string exception, which is not a valid exception object
            raise TypeError(
                "NOT a SpliceAcceptor or a ProjectedSpliceAcceptor: %s" % siteclass
                )

    # update minimal pssm score to stg collection object
    stg = AcceptorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_ACCEPTOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE

    # First, proces each individual organism.
    # (A) obtain elegiable splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        is_forced = org in forced_codingblock_ends

        if is_forced:
            # the node that represents this site
            cbgSta = forced_codingblock_ends[org]
            cbgStaNode = ( org,theorf.id,cbgSta.pos )
            # add to the collection graph
            stg.add_node_and_object(cbgStaNode,cbgSta)
            # no `continue` yet; the considered range is still set below

        if prev_is_lsrcbg:
            # all `acceptor` boundaries are hard-set
            # no splice_site_range or actual site prediction needed
            continue

        ################################################################
        ### get the considered splice site range (nt coordinates)
        ################################################################
        _max_aa_pos, max_nt_pos = self.maximal_eligable_acceptor_site_position(org)
        min_nt_pos = theorf.startPY-2
        if prev and org in prev.organism_set():
            _min_aa_pos, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org,prevcbg=prev)
        else:
            _min_aa_pos, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org)
        if next_min_nt_pos > min_nt_pos:
            # minimal range falls within the orf's start point
            min_nt_pos = next_min_nt_pos

        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        # The range filters below treat a falsy max_nt_pos as `no upper
        # bound`; an absent bound can not be enlarged, so leave it as is.
        if enlarge_3p_boundary_by and max_nt_pos is not None:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if is_forced:
            # ready with this organism, no splice site setting!
            continue

        ################################################################
        ### obtain splice sites for current collection
        ################################################################
        # scan for splice sites
        theorf.scan_orf_for_pssm_splice_sites(
                splicetype="acceptor",
                min_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
                allow_non_canonical=ALLOW_NON_CANONICAL_ACCEPTOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE)

        # first, add the projected splicesites (they overrule true sites)
        for projsite in projected_acceptors.get(org,()):
            # check if we can ignore this site
            if not store_all_projected_sites:
                if projsite.pos < min_nt_pos: continue
                if max_nt_pos and projsite.pos > max_nt_pos: continue
            # create and add this projected site!
            projNode = ( org,theorf.id,projsite.pos )
            stg.add_node_and_object(projNode,projsite)

        # add the splice sites to the graph
        for asq in theorf._acceptor_sites:
            # check if we can ignore this site
            if asq.pos < min_nt_pos: continue
            if max_nt_pos and asq.pos > max_nt_pos: continue
            # the node that represents this site
            asqNode = ( org,theorf.id,asq.pos )
            # check if this splice site is not already added as a projected site
            if asqNode not in stg.get_nodes():
                stg.add_node_and_object(asqNode,asq)

    # No nodes are added from here on (only edges), so the set of organisms
    # that have sites can be obtained once.
    organisms_with_sites = frozenset(stg.organism_set())

    # now loop over all aligned combinations of organisms
    for ( _pacbpkey,(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only proces this combination if both organisms have splice sites!
        if g1 not in organisms_with_sites: continue
        if g2 not in organisms_with_sites: continue

        # now loop over all acceptor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites are at
        # most ``ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3`` nt apart
        for asq in stg.get_organism_objects(g1):
            # the node that represents this site
            asqNode = ( g1,o1,asq.pos )
            asqClass = asq.__class__.__name__

            for ass in stg.get_organism_objects(g2):
                # the node that represents this site
                assNode = ( g2,o2,ass.pos )
                assClass = ass.__class__.__name__

                if 'CodingBlockStart' in [ asqClass,assClass ]:
                    if asqClass == assClass:
                        # both CodingBlockStart objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = asq.pos, sbjct = ass.pos
                                )
                        # check for the distance constrain
                        if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue
                else:
                    # Both Acceptor sites; check for phase compatibility
                    same_phase = asq.phase == ass.phase
                    if not same_phase and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = asq.pos, sbjct = ass.pos
                            )

                    # sites of different phase are only accepted as a
                    # potential splice site phase shift: close together
                    # and both with a high enough pssm score
                    if not same_phase and not (
                            dist <= MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and
                            min([ asq.pssm_score, ass.pssm_score ]) >=
                            MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE ):
                        continue

                    # check for the distance constrain
                    if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query and Sbjct
                entropyQ = _binary_entropy(asq,pacbporf,True)
                entropyS = _binary_entropy(ass,pacbporf,False)

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary
                # entropy values. `dist/3` is kept exactly as it was
                # written (whole triplets under Python 2 integer division).
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(asqNode,assNode,wt=wt)
                stg._edge_binary_entropies[(asqNode,assNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(assNode,asqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg
def ExonCollectionGraph2AcceptorSiteCollectionGraph(gra):
    """
    Build an AcceptorSiteCollectionGraph from the acceptors of an ECG

    Each node of the ECG contributes one node, keyed on the first two
    elements of the ECG node plus the position of its acceptor site. An
    acceptor of class CodingBlockStart is first converted into a projected
    splice acceptor. All pairwise cross combinations of the new nodes are
    connected with weight 1.0 and binary entropies (1.0,1.0).

    @attention: only in use when ECG is NOT a FirstExon ECG

    @type  gra: ExonCollectionGraph
    @param gra: ExonCollectionGraph instance

    @rtype:  AcceptorSiteCollectionGraph
    @return: AcceptorSiteCollectionGraph instance to be placed in the CBG
    """
    ascg = AcceptorSiteCollectionGraph()
    ascg.ALIGNED_SITE_AA_OFFSET = 10
    ascg.MIN_PSSM_SCORE = -0.0

    # one acceptor site node per node of the ECG
    for ecgnode in gra.get_nodes():
        acceptor = gra._node_object[ecgnode].acceptor
        if acceptor.__class__.__name__ == 'CodingBlockStart':
            # represent the CodingBlockStart as a ProjectedSpliceSite
            site = CodingBlockStart2ProjectedSpliceAcceptor(
                    acceptor, phase=gra.acceptor_phase() )
        else:
            site = acceptor
        sitenode = ( ecgnode[0], ecgnode[1], site.pos )
        ascg.add_node_and_object(sitenode,site)
        # the pssm score is always taken from the original acceptor object
        ascg._node_pssm[sitenode] = acceptor.pssm_score

    # fully connect the sites; entropies are fixed for both directions
    fixed_entropies = (1.0, 1.0)
    for first,second in ascg.pairwisecrosscombinations_node():
        ascg.add_edge(first,second,wt=1.0)
        ascg._edge_binary_entropies[(first,second)] = fixed_entropies
        ascg._edge_binary_entropies[(second,first)] = fixed_entropies

    return ascg