def harvest_elegiable_tss_sites(self,
        max_aa_distance=ALIGNED_TSS_MAX_AA_DISTANCE,
        tss_min_pssm_score=TSS_MIN_PSSM_SCORE,
        skip_nonelegiable_sites=True):
    """
    Harvest candidate translational start sites (TSS) into a collection graph

    Per organism the first Orf is scanned for PSSM-scored TSS loci; every
    accepted locus becomes a node ( org, orfid, aapos, ntpos ). Next, for
    each pacbporf in self.pacbps, the sites of both organisms are compared
    and an edge is added when their aligned protein positions are at most
    ``max_aa_distance`` apart. Finally TCODE scores are stored per node.

    @param max_aa_distance: maximal distance (in AA) between two aligned
                            sites to connect them with an edge; also stored
                            as ALIGNED_SITE_AA_OFFSET on the returned graph

    @param tss_min_pssm_score: minimal PSSM score passed to the Orf TSS scan;
                            also stored as MIN_PSSM_SCORE on the returned graph

    @param skip_nonelegiable_sites: when True, ignore sites outside the range
                            reported by minimal_eligable_tss_position() and
                            maximal_eligable_tss_position()

    @rtype:  TranslationalStartSiteCollectionGraph
    @return: filled TranslationalStartSiteCollectionGraph instance
    """
    # update minimal pssm score to stg collection object
    stg = TranslationalStartSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = tss_min_pssm_score
    stg.ALIGNED_SITE_AA_OFFSET = max_aa_distance

    # First, process each individual organism.
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]

        # ready if there are no potential tss loci (no ATG sequence)
        if not theorf.has_start():
            continue

        # scan for tss loci
        theorf.scan_orf_for_pssm_tss(min_pssm_score=tss_min_pssm_score)

        # get the considered TSS range (nt coordinates); None == no limit
        if skip_nonelegiable_sites:
            min_nt_pos = self.minimal_eligable_tss_position(org)[1]
            max_nt_pos = self.maximal_eligable_tss_position(org)[1]
        else:
            min_nt_pos = None
            max_nt_pos = None

        for tss in theorf._tss_sites:
            # check if we can ignore this site
            # NOTE(review): a limit of 0 is treated as `no limit` by these
            # truthiness tests; kept as-is because the return contract of
            # the *_eligable_tss_position helpers is not visible here.
            if min_nt_pos and tss.pos < min_nt_pos:
                continue
            if max_nt_pos and tss.pos > max_nt_pos:
                continue

            # an accepted site; add to TSS Collection Graph.
            # Explicit floor division keeps the AA coordinate an integer,
            # also when true division is in effect.
            tssNode = ( org, theorf.id, tss.pos // 3, tss.pos )
            stg.add_node_and_object(tssNode,tss)

    # Second, evaluate all cross combinations.
    # No nodes are added from here on, so the organism set is invariant.
    tss_organisms = stg.organism_set()
    for ( _pacbpkey,(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only process this combination if both organisms have TSS sites!
        if g1 not in tss_organisms:
            continue
        if g2 not in tss_organisms:
            continue

        # the Sbjct sites do not depend on the Query site; fetch them once
        sbjct_sites = list( stg.get_organism_objects(g2) )

        # now loop over all TSS in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are less than ``max_aa_distance`` apart from each other
        for tssQ in stg.get_organism_objects(g1):
            # the node that represents this site
            startQpos = tssQ.pos // 3
            startQnode = ( g1, o1, startQpos, tssQ.pos )

            for tssS in sbjct_sites:
                # the node that represents this site
                startSpos = tssS.pos // 3
                startSnode = ( g2, o2, startSpos, tssS.pos )

                # get distance between (aligned) start-codons
                dist = pacbporf.get_distance_aligned_protein_positions(
                        query=startQpos,sbjct=startSpos)

                # continue if distance between start sites is too big
                if dist > max_aa_distance:
                    continue

                # calculate binary entropies from both positions
                startQpositionPos,phaseQ = pacbporf.dnaposition_query(
                        tssQ.pos,forced_return=True)
                startSpositionPos,phaseS = pacbporf.dnaposition_sbjct(
                        tssS.pos,forced_return=True)
                entropyQ = pacbporf.alignment_entropy(
                        startQpositionPos,method='left')
                entropyS = pacbporf.alignment_entropy(
                        startSpositionPos,method='left')

                # calculate a weight from distance between startQpos and startSpos
                wt = 1.0 / ( 1.0 + float(dist) )

                # check if edge already in graph
                if stg.has_edge( startQnode, startSnode ):
                    _wt = stg.weights[( startQnode, startSnode )]
                    if not wt > _wt:
                        # existing edge is at least as good; leave untouched
                        continue
                    stg.set_edge_weight( startQnode, startSnode, wt=wt )
                else:
                    stg.add_edge( startQnode, startSnode, wt=wt )

                # and add binary entropy values (both edge directions).
                # NOTE(review): entropies are (re)written only for a new
                # edge or an improved weight; confirm that is intended.
                stg._edge_binary_entropies[(startQnode, startSnode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(startSnode, startQnode)] = (entropyS,entropyQ)

    # Get tcode data for these start codon nodes
    # Assuming that this is indeed the start-codon,
    # the stretch of ATG until max(OMSR) will be coding.
    # Take the length of this stretch (in nt) as right/3p window size
    omsr = self.overall_minimal_spanning_range()
    for (org,orfid,aaPos,dnaPos) in stg.get_nodes():
        theorf = self.get_orfs_of_graph(organism=org)[0]
        right_window_size = ( max(omsr[(org,orfid)])+1 - aaPos )*3
        # confirm that window size is not <= 0; this is possible
        # once the Methionine/TSS is located downstream of the
        # OMSR max site. Fall back to the default 3p window size.
        if right_window_size <= 0:
            right_window_size = stg._TCODE_3P_WINDOWSIZE

        # calculate the average TCODE scores for the windows
        ( tcode5p,tcode3p ) = theorf.tcode_entropy_of_pos(
                aaPos,
                window_left=stg._TCODE_5P_WINDOWSIZE,
                window_right=right_window_size,
                )
        stg._tcode5pscore[(org,orfid,aaPos,dnaPos)] = tcode5p
        stg._tcode3pscore[(org,orfid,aaPos,dnaPos)] = tcode3p

    # return filled tsscollection graph
    return stg
def ExonCollectionGraph2TranslationalStartSiteCollectionGraph(gra):
    """
    Convert ECG -> TranslationalStartSiteCollectionGraph

    Every exon node ( org, orf, ntpos ) is translated into a TSS node
    ( org, orf, aapos, ntpos ) carrying the ``acceptor`` object of the exon.
    All pairwise cross combinations of the new nodes are connected with
    weight 1.0 and binary entropies (1.0,1.0); TCODE scores are stored
    per node using the default 5p/3p window sizes of the new graph.

    @attention: only in use when ECG is a FirstExon ECG

    @param gra: graph to convert; presumably an ExonCollectionGraph
                (must offer get_nodes(), _node_object and get_orfs_of_graph())

    @rtype:  TranslationalStartSiteCollectionGraph
    @return: TranslationalStartSiteCollectionGraph instance to be placed in the CBG
    """
    newgra = TranslationalStartSiteCollectionGraph()
    newgra.ALIGNED_SITE_AA_OFFSET = 10
    newgra.MIN_PSSM_SCORE = -0.0

    for node in gra.get_nodes():
        # exon node is ( org, orf, ntpos), TSS node ( org, orf, aapos, ntpos )
        # Explicit floor division keeps the AA coordinate an integer,
        # also when true division is in effect.
        newnode = ( node[0], node[1], node[2] // 3, node[2] )
        # the `acceptor` attribute of the exon object serves as the TSS here
        tss = gra._node_object[node].acceptor
        newgra.add_node_and_object(newnode,tss)
        newgra._node_pssm[newnode] = tss.pssm_score

    # connect all cross combinations with a fixed weight and fixed entropies
    entropyQ = 1.0
    entropyS = 1.0
    for nodeA,nodeB in newgra.pairwisecrosscombinations_node():
        newgra.add_edge(nodeA,nodeB,wt=1.0)
        newgra._edge_binary_entropies[(nodeA,nodeB)] = (entropyQ,entropyS)
        newgra._edge_binary_entropies[(nodeB,nodeA)] = (entropyS,entropyQ)

    # Get tcode data for these start codon nodes
    # Assuming that this is indeed the start-codon,
    for (org,orfid,aaPos,dnaPos) in newgra.get_nodes():
        # NOTE(review): the first Orf of the organism is taken, regardless
        # of ``orfid``; confirm the ECG holds a single Orf per organism.
        theorf = gra.get_orfs_of_graph(organism=org)[0]
        # calculate the average TCODE scores for the windows
        ( tcode5p,tcode3p ) = theorf.tcode_entropy_of_pos(
                aaPos,
                window_left=newgra._TCODE_5P_WINDOWSIZE,
                window_right=newgra._TCODE_3P_WINDOWSIZE,
                )
        newgra._tcode5pscore[(org,orfid,aaPos,dnaPos)] = tcode5p
        newgra._tcode3pscore[(org,orfid,aaPos,dnaPos)] = tcode3p

    # return the TranslationalStartSiteCollectionGraph
    return newgra