def ExonCollectionGraph2DonorSiteCollectionGraph(gra):
    """
    Build a DonorSiteCollectionGraph from the donor sites of an ECG

    Every node of `gra` contributes one node to the new graph, keyed on
    ( node[0], node[1], site position ). All nodes are then connected with
    edges of weight 1.0 and entropy 1.0.

    @attention: only in use when ECG is NOT a FinalExon ECG

    @type  gra: ExonCollectionGraph (presumably; not verifiable from here)
    @param gra: graph whose node objects each carry a `donor` attribute

    @rtype:  DonorSiteCollectionGraph
    @return: DonorSiteCollectionGraph instance to be placed in the CBG
    """
    dsc_graph = DonorSiteCollectionGraph()
    dsc_graph.ALIGNED_SITE_AA_OFFSET = 10
    dsc_graph.MIN_PSSM_SCORE = -0.0

    for ecg_node in gra.get_nodes():
        site = gra._node_object[ecg_node].donor
        if site.__class__.__name__ != 'CodingBlockEnd':
            # a regular donor site is stored as-is
            stored_site = site
        else:
            # a CodingBlockEnd is stored as its projected splice donor,
            # created in the donor phase of the input graph
            stored_site = CodingBlockEnd2ProjectedSpliceDonor(
                    site, phase=gra.donor_phase())

        site_node = ( ecg_node[0], ecg_node[1], stored_site.pos )
        dsc_graph.add_node_and_object(site_node, stored_site)
        # the pssm score is always taken from the ORIGINAL donor object,
        # also when a projected site was stored in its place
        dsc_graph._node_pssm[site_node] = site.pssm_score

    # connect all node combinations offered by the collection graph
    for first, second in dsc_graph.pairwisecrosscombinations_node():
        dsc_graph.add_edge(first, second, wt=1.0, entropy=1.0)

    return dsc_graph
def harvest_elegiable_donor_sites(self,projected_donors={},forced_codingblock_ends={},next=None,
        store_all_projected_sites=False,
        allow_phase_shift=False,
        enlarge_5p_boundary_by=None, # in AA coordinates
        enlarge_3p_boundary_by=None, # in AA coordinates
        ALIGNED_DONOR_MAX_TRIPLET_DISTANCE=None,
        MIN_DONOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_DONOR=False,
        NON_CANONICAL_MIN_DONOR_PSSM_SCORE=None ):
    """
    Harvest eligible donor sites from this CodingBlockGraph into a DonorSiteCollectionGraph

    Per organism the considered splice site range is determined, the Orf is
    scanned for donor sites and all sites in range (plus projected sites and
    forced CodingBlockEnds) are added as nodes. Subsequently, for each pacbporf
    in self.pacbps, sites of both organisms are connected with an edge when
    they are close enough in the alignment.

    @type  projected_donors: dict
    @param projected_donors: organism -> iterable of projected donor sites;
                    only read here, never modified

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> hard-set end object; organisms
                    listed here get this object as only node (no site scan);
                    only read here, never modified

    @type  next: CodingBlockGraph, LowSimilarityRegionCodingBlockGraph or None
    @param next: the next CBG; used to limit the 3' side of the site range

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: add projected sites even when they
                    fall outside the considered splice site range

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: allow edges between donors of unequal phase
                    (subject to the module's phase shift distance/score limits)

    @type  enlarge_5p_boundary_by: integer or None
    @param enlarge_5p_boundary_by: AA positions to extend the range 5' (x3 for nt)

    @type  enlarge_3p_boundary_by: integer or None
    @param enlarge_3p_boundary_by: AA positions to extend the range 3' (x3 for nt)

    @type  ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: integer
    @param ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: max aligned distance (x3 for nt)
                    between two sites to get an edge. The None default is not
                    usable once two sites must be compared (None*3 fails), so
                    callers are expected to provide a value.

    @param MIN_DONOR_PSSM_SCORE: passed to the Orf donor scan & stored on the graph
    @param ALLOW_NON_CANONICAL_DONOR: passed to the Orf donor scan
    @param NON_CANONICAL_MIN_DONOR_PSSM_SCORE: passed to the Orf donor scan

    @rtype:  DonorSiteCollectionGraph
    @return: collection graph with donor site nodes and aligned-site edges

    @raise InproperlyAppliedArgument: `next` is not a (lsr)CodingBlockGraph
    @raise TypeError: a stored site object is not a SpliceDonor,
                    ProjectedSpliceDonor or CodingBlockEnd
    """
    # NOTE(review): a function with this name is defined twice in this file;
    # confirm which definition is meant to be live.

    def _site_binary_entropy(pacbporf, site):
        """ Return the binary entropy value for a single site object """
        siteClass = site.__class__.__name__
        if siteClass == 'SpliceDonor':
            # NOTE(review): this lookup is applied to Query AND Sbjct sites,
            # exactly as the original code did. For the Sbjct site a
            # sbjct-based lookup looks intended -- confirm against PacbPORF.
            alignmentPos, _phase = pacbporf.dnaposition_query(site.pos,forced_return=True)
            return pacbporf.alignment_entropy(alignmentPos,method='donor')
        elif siteClass == 'ProjectedSpliceDonor':
            return site.entropy
        elif siteClass == 'CodingBlockEnd':
            return 1.0
        # a real exception instead of the former (invalid) string exception
        raise TypeError(
            "%s NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]" % siteClass )

    nextClass = next.__class__.__name__
    if next and nextClass not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "next must be a CodingBlock graph object, not a %s" % nextClass
        raise InproperlyAppliedArgument(message)

    # update minimal pssm score to stg collection object
    stg = DonorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_DONOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_DONOR_MAX_TRIPLET_DISTANCE

    # First, process each individual organism.
    # (A) obtain eligible splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        has_forced_end = org in forced_codingblock_ends

        if has_forced_end:
            # the hard-set end is the only node of this organism
            cbgEnd = forced_codingblock_ends[org]
            stg.add_node_and_object( ( org,theorf.id,cbgEnd.pos ), cbgEnd )

        if nextClass == "LowSimilarityRegionCodingBlockGraph":
            # continue; all `donor` boundaries are hard-set
            # no splice_site_range or actual site prediction needed
            continue

        ####################################################################
        ### get the considered splice site range
        ####################################################################
        # The range is based on the EOF Orf object. theorf.endPY + 2 (two!)
        # is taken, because EOF Orf is the start of the STOP codon. Example:
        # ... tca TAG tac gtc ...
        # ... tca                   EOF Orf
        #         TAG               STOP codon
        #     ..a taG Tac gt.       perfect DONOR Site; PSSM-score ~7.7
        # Only nt coordinates are used further on; AA coordinates returned
        # by the range functions are ignored.
        (_min_aa_pos, min_nt_pos) = self.minimal_eligable_donor_site_position(org)
        max_nt_pos = theorf.endPY+2
        if next and org in next.organism_set():
            (_next_max_aa_pos, next_max_nt_pos) =\
                self.maximal_eligable_donor_site_position(org,nextcbg=next)
        else:
            (_next_max_aa_pos, next_max_nt_pos) =\
                self.maximal_eligable_donor_site_position(org)
        if next_max_nt_pos < max_nt_pos:
            # the CBG-based limit is more strict than the Orf's end
            max_nt_pos = next_max_nt_pos

        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_end:
            # ready with this organism, no splice site setting!
            continue

        ####################################################################
        ### obtain splice sites for current collection
        ####################################################################
        theorf.scan_orf_for_pssm_splice_sites(splicetype="donor",
                min_pssm_score=MIN_DONOR_PSSM_SCORE,
                allow_non_canonical=ALLOW_NON_CANONICAL_DONOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
                forced=True)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_donors:
            for projsite in projected_donors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                stg.add_node_and_object( ( org,theorf.id,projsite.pos ), projsite )

        # add the splice sites to the graph
        for dsq in theorf._donor_sites:
            # check if we can ignore this site
            if dsq.pos < min_nt_pos: continue
            if max_nt_pos and dsq.pos > max_nt_pos: continue
            # the node that represents this site
            dsqNode = ( org,theorf.id,dsq.pos )
            # do not overwrite a site already added as a projected site
            if dsqNode not in stg.get_nodes():
                stg.add_node_and_object(dsqNode,dsq)

    # now loop over all aligned combinations of organisms
    for ( _pacbpkey,(g1,o1),(g2,o2) ), pacbporf in self.pacbps.items():
        # only process this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all donor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are less than ``ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3`` apart
        for dsq in stg.get_organism_objects(g1):
            # the node that represents this site
            dsqNode = ( g1,o1,dsq.pos )
            dsqClass = dsq.__class__.__name__

            for dss in stg.get_organism_objects(g2):
                # the node that represents this site
                dssNode = ( g2,o2,dss.pos )
                dssClass = dss.__class__.__name__

                if 'CodingBlockEnd' in [ dsqClass,dssClass ]:
                    if dsqClass == dssClass:
                        # both CodingBlockEnd objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = dsq.pos, sbjct = dss.pos )
                        # check for the distance constraint
                        if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue
                else:
                    # Both Donor sites; check for phase compatibility
                    same_phase = dsq.phase == dss.phase
                    if not same_phase and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = dsq.pos, sbjct = dss.pos )

                    if not same_phase:
                        # potential splice site phase shift; only accepted
                        # for nearby sites that both have a high pssm score
                        if dist > MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE:
                            continue
                        if min([dsq.pssm_score, dss.pssm_score ]) <\
                        MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE:
                            continue

                    # check for the (general) distance constraint
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query and Sbjct
                entropyQ = _site_binary_entropy(pacbporf,dsq)
                entropyS = _site_binary_entropy(pacbporf,dss)

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(dsqNode,dssNode,wt=wt)
                stg._edge_binary_entropies[(dsqNode,dssNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(dssNode,dsqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg
def harvest_elegiable_donor_sites(self,projected_donors={},forced_codingblock_ends={},next=None,
        store_all_projected_sites=False,
        allow_phase_shift=False,
        enlarge_5p_boundary_by=None, # in AA coordinates
        enlarge_3p_boundary_by=None, # in AA coordinates
        ALIGNED_DONOR_MAX_TRIPLET_DISTANCE=None,
        MIN_DONOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_DONOR=False,
        NON_CANONICAL_MIN_DONOR_PSSM_SCORE=None ):
    """
    Harvest eligible donor sites from this CodingBlockGraph into a DonorSiteCollectionGraph

    Per organism the considered splice site range is determined, the Orf is
    scanned for donor sites and all sites in range (plus projected sites and
    forced CodingBlockEnds) are added as nodes. Subsequently, for each pacbporf
    in self.pacbps, sites of both organisms are connected with an edge when
    they are close enough in the alignment.

    @type  projected_donors: dict
    @param projected_donors: organism -> iterable of projected donor sites;
                    only read here, never modified

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> hard-set end object; organisms
                    listed here get this object as only node (no site scan);
                    only read here, never modified

    @type  next: CodingBlockGraph, LowSimilarityRegionCodingBlockGraph or None
    @param next: the next CBG; used to limit the 3' side of the site range

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: add projected sites even when they
                    fall outside the considered splice site range

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: allow edges between donors of unequal phase
                    (subject to the module's phase shift distance/score limits)

    @type  enlarge_5p_boundary_by: integer or None
    @param enlarge_5p_boundary_by: AA positions to extend the range 5' (x3 for nt)

    @type  enlarge_3p_boundary_by: integer or None
    @param enlarge_3p_boundary_by: AA positions to extend the range 3' (x3 for nt)

    @type  ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: integer
    @param ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: max aligned distance (x3 for nt)
                    between two sites to get an edge. The None default is not
                    usable once two sites must be compared (None*3 fails), so
                    callers are expected to provide a value.

    @param MIN_DONOR_PSSM_SCORE: passed to the Orf donor scan & stored on the graph
    @param ALLOW_NON_CANONICAL_DONOR: passed to the Orf donor scan
    @param NON_CANONICAL_MIN_DONOR_PSSM_SCORE: passed to the Orf donor scan

    @rtype:  DonorSiteCollectionGraph
    @return: collection graph with donor site nodes and aligned-site edges

    @raise InproperlyAppliedArgument: `next` is not a (lsr)CodingBlockGraph
    @raise TypeError: a stored site object is not a SpliceDonor,
                    ProjectedSpliceDonor or CodingBlockEnd
    """
    # NOTE(review): a function with this name is defined twice in this file;
    # confirm which definition is meant to be live.

    def _site_binary_entropy(pacbporf, site):
        """ Return the binary entropy value for a single site object """
        siteClass = site.__class__.__name__
        if siteClass == 'SpliceDonor':
            # NOTE(review): this lookup is applied to Query AND Sbjct sites,
            # exactly as the original code did. For the Sbjct site a
            # sbjct-based lookup looks intended -- confirm against PacbPORF.
            alignmentPos, _phase = pacbporf.dnaposition_query(site.pos,forced_return=True)
            return pacbporf.alignment_entropy(alignmentPos,method='donor')
        elif siteClass == 'ProjectedSpliceDonor':
            return site.entropy
        elif siteClass == 'CodingBlockEnd':
            return 1.0
        # a real exception instead of the former (invalid) string exception
        raise TypeError(
            "%s NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]" % siteClass )

    nextClass = next.__class__.__name__
    if next and nextClass not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "next must be a CodingBlock graph object, not a %s" % nextClass
        raise InproperlyAppliedArgument(message)

    # update minimal pssm score to stg collection object
    stg = DonorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_DONOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_DONOR_MAX_TRIPLET_DISTANCE

    # First, process each individual organism.
    # (A) obtain eligible splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        has_forced_end = org in forced_codingblock_ends

        if has_forced_end:
            # the hard-set end is the only node of this organism
            cbgEnd = forced_codingblock_ends[org]
            stg.add_node_and_object( ( org,theorf.id,cbgEnd.pos ), cbgEnd )

        if nextClass == "LowSimilarityRegionCodingBlockGraph":
            # continue; all `donor` boundaries are hard-set
            # no splice_site_range or actual site prediction needed
            continue

        ####################################################################
        ### get the considered splice site range
        ####################################################################
        # The range is based on the EOF Orf object. theorf.endPY + 2 (two!)
        # is taken, because EOF Orf is the start of the STOP codon. Example:
        # ... tca TAG tac gtc ...
        # ... tca                   EOF Orf
        #         TAG               STOP codon
        #     ..a taG Tac gt.       perfect DONOR Site; PSSM-score ~7.7
        # Only nt coordinates are used further on; AA coordinates returned
        # by the range functions are ignored.
        (_min_aa_pos, min_nt_pos) = self.minimal_eligable_donor_site_position(org)
        max_nt_pos = theorf.endPY+2
        if next and org in next.organism_set():
            (_next_max_aa_pos, next_max_nt_pos) =\
                self.maximal_eligable_donor_site_position(org,nextcbg=next)
        else:
            (_next_max_aa_pos, next_max_nt_pos) =\
                self.maximal_eligable_donor_site_position(org)
        if next_max_nt_pos < max_nt_pos:
            # the CBG-based limit is more strict than the Orf's end
            max_nt_pos = next_max_nt_pos

        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_end:
            # ready with this organism, no splice site setting!
            continue

        ####################################################################
        ### obtain splice sites for current collection
        ####################################################################
        theorf.scan_orf_for_pssm_splice_sites(splicetype="donor",
                min_pssm_score=MIN_DONOR_PSSM_SCORE,
                allow_non_canonical=ALLOW_NON_CANONICAL_DONOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
                forced=True)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_donors:
            for projsite in projected_donors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                stg.add_node_and_object( ( org,theorf.id,projsite.pos ), projsite )

        # add the splice sites to the graph
        for dsq in theorf._donor_sites:
            # check if we can ignore this site
            if dsq.pos < min_nt_pos: continue
            if max_nt_pos and dsq.pos > max_nt_pos: continue
            # the node that represents this site
            dsqNode = ( org,theorf.id,dsq.pos )
            # do not overwrite a site already added as a projected site
            if dsqNode not in stg.get_nodes():
                stg.add_node_and_object(dsqNode,dsq)

    # now loop over all aligned combinations of organisms
    for ( _pacbpkey,(g1,o1),(g2,o2) ), pacbporf in self.pacbps.items():
        # only process this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all donor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are less than ``ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3`` apart
        for dsq in stg.get_organism_objects(g1):
            # the node that represents this site
            dsqNode = ( g1,o1,dsq.pos )
            dsqClass = dsq.__class__.__name__

            for dss in stg.get_organism_objects(g2):
                # the node that represents this site
                dssNode = ( g2,o2,dss.pos )
                dssClass = dss.__class__.__name__

                if 'CodingBlockEnd' in [ dsqClass,dssClass ]:
                    if dsqClass == dssClass:
                        # both CodingBlockEnd objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = dsq.pos, sbjct = dss.pos )
                        # check for the distance constraint
                        if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue
                else:
                    # Both Donor sites; check for phase compatibility
                    same_phase = dsq.phase == dss.phase
                    if not same_phase and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = dsq.pos, sbjct = dss.pos )

                    if not same_phase:
                        # potential splice site phase shift; only accepted
                        # for nearby sites that both have a high pssm score
                        if dist > MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE:
                            continue
                        if min([dsq.pssm_score, dss.pssm_score ]) <\
                        MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE:
                            continue

                    # check for the (general) distance constraint
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query and Sbjct
                entropyQ = _site_binary_entropy(pacbporf,dsq)
                entropyS = _site_binary_entropy(pacbporf,dss)

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(dsqNode,dssNode,wt=wt)
                stg._edge_binary_entropies[(dsqNode,dssNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(dssNode,dsqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg