def harvest_elegiable_tss_sites(self,max_aa_distance=ALIGNED_TSS_MAX_AA_DISTANCE,
    tss_min_pssm_score=TSS_MIN_PSSM_SCORE,
    skip_nonelegiable_sites=True):
    """
    Collect candidate translational start sites (TSS) into a collection graph

    The graph is filled in three passes:
      1. per organism, the first orf is scanned for PSSM TSS loci and each
         accepted locus becomes a node ( org, orfid, aaPos, ntPos )
      2. per entry in self.pacbps, an edge is added between each query and
         sbjct TSS node whose aligned distance is <= max_aa_distance;
         the edge weight is 1.0 / ( 1.0 + distance )
      3. per node, the 5p and 3p TCODE scores are stored on the graph

    @type  max_aa_distance: number
    @param max_aa_distance: largest accepted distance between two aligned
                            start sites; also stored on the graph as
                            ALIGNED_SITE_AA_OFFSET

    @type  tss_min_pssm_score: number
    @param tss_min_pssm_score: passed to the orf TSS scan and stored on
                               the graph as MIN_PSSM_SCORE

    @type  skip_nonelegiable_sites: Boolean
    @param skip_nonelegiable_sites: when True, sites outside the nt range
            given by self.minimal_eligable_tss_position() and
            self.maximal_eligable_tss_position() are not added

    @rtype:  TranslationalStartSiteCollectionGraph
    @return: the filled TSS collection graph
    """
    # update minimal pssm score to stg collection object
    stg = TranslationalStartSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = tss_min_pssm_score
    stg.ALIGNED_SITE_AA_OFFSET = max_aa_distance

    # First, process each individual organism.
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        # ready if there are no potential tss loci (no ATG sequence)
        if not theorf.has_start(): continue
        # scan for tss loci
        theorf.scan_orf_for_pssm_tss(min_pssm_score=tss_min_pssm_score)

        if skip_nonelegiable_sites:
            # get the considered TSS range; only the nt coordinate
            # of each ( aa, nt ) pair is needed for the filtering
            (_min_aa_pos, min_nt_pos) = self.minimal_eligable_tss_position(org)
            (_max_aa_pos, max_nt_pos) = self.maximal_eligable_tss_position(org)
        else:
            min_nt_pos = None
            max_nt_pos = None

        for tss in theorf._tss_sites:
            # check if we can ignore this site
            # NOTE(review): a falsy bound (None, but also 0) disables the
            # check; confirm that 0 is never a meaningful upper bound
            if min_nt_pos and tss.pos < min_nt_pos: continue
            if max_nt_pos and tss.pos > max_nt_pos: continue
            # an accepted site; add to TSS Collection Graph.
            # Floor division keeps the aa coordinate an integer, also
            # when true division is in effect
            tssNode = ( org, theorf.id, tss.pos // 3, tss.pos )
            stg.add_node_and_object(tssNode,tss)

    # Second, evaluate all cross combinations
    pacbps = self.pacbps
    if hasattr(pacbps,'iteritems'):
        pacbp_items = pacbps.iteritems()
    else:
        # no iteritems() available (e.g. a Python 3 dict)
        pacbp_items = pacbps.items()
    for pacbpkey, pacbporf in pacbp_items:
        # only the ( organism, orf ) pairs of the key are used here
        ( _pacbpinfo, (g1,o1), (g2,o2) ) = pacbpkey

        # only process this combination if both organisms have TSS sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all TSS in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are at most ``max_aa_distance`` apart from each other
        for tssQ in stg.get_organism_objects(g1):
            # the node that represents this site
            startQpos  = tssQ.pos // 3
            startQnode = ( g1, o1, startQpos, tssQ.pos )
            for tssS in stg.get_organism_objects(g2):
                # the node that represents this site
                startSpos  = tssS.pos // 3
                startSnode = ( g2, o2, startSpos, tssS.pos )

                # get distance between (aligned) start-codons
                dist = pacbporf.get_distance_aligned_protein_positions(
                        query=startQpos,sbjct=startSpos)

                # continue if distance between start sites is too big
                if dist > max_aa_distance: continue

                # calculate binary entropies from both positions;
                # the phase part of the returned pair is not needed
                startQpositionPos = pacbporf.dnaposition_query(
                        tssQ.pos,forced_return=True)[0]
                startSpositionPos = pacbporf.dnaposition_sbjct(
                        tssS.pos,forced_return=True)[0]
                entropyQ = pacbporf.alignment_entropy(startQpositionPos,method='left')
                entropyS = pacbporf.alignment_entropy(startSpositionPos,method='left')

                # calculate a weight from distance between startQpos and startSpos
                wt = 1.0 / ( 1.0 + float(dist) )

                # an edge that is already in the graph is only
                # overwritten when this alignment gives a higher weight
                if stg.has_edge( startQnode, startSnode ):
                    if not wt > stg.weights[( startQnode, startSnode )]:
                        continue
                    stg.set_edge_weight( startQnode, startSnode, wt=wt )
                else:
                    stg.add_edge( startQnode, startSnode, wt=wt )

                # and add binary entropy values (both edge directions)
                stg._edge_binary_entropies[(startQnode, startSnode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(startSnode, startQnode)] = (entropyS,entropyQ)

    # Get tcode data for these start codon nodes
    # Assuming that this is indeed the start-codon,
    # the stretch of ATG until max(OMSR) will be coding.
    # Take the length of this stretch (in nt) as right/3p window size
    omsr = self.overall_minimal_spanning_range()
    for (org,orfid,aaPos,dnaPos) in stg.get_nodes():
        node   = (org,orfid,aaPos,dnaPos)
        theorf = self.get_orfs_of_graph(organism=org)[0]
        right_window_size = ( max(omsr[(org,orfid)])+1 - aaPos )*3
        # confirm that window size is not <= 0; this is possible
        # once the Methionine/TSS is located downstream of the
        # OMSR max site. Fall back to the graph's default 3p window
        if right_window_size <= 0:
            right_window_size = stg._TCODE_3P_WINDOWSIZE
        # calculate the average TCODE scores for the windows
        ( tcode5p,tcode3p ) = theorf.tcode_entropy_of_pos(
                aaPos,
                window_left=stg._TCODE_5P_WINDOWSIZE,
                window_right=right_window_size,
                )
        stg._tcode5pscore[node] = tcode5p
        stg._tcode3pscore[node] = tcode3p

    # return filled tsscollection graph
    return stg
# Example #2
# 0
def ExonCollectionGraph2TranslationalStartSiteCollectionGraph(gra):
    """
    Convert ECG -> TranslationalStartSiteCollectionGraph

    Every exon node ( org, orf, ntpos ) becomes a TSS node
    ( org, orf, aapos, ntpos ) carrying the ``acceptor`` object of the
    exon as its site object. All pairwise cross combinations of the new
    nodes are connected with weight 1.0 and binary entropies (1.0,1.0).
    Finally the 5p/3p TCODE scores are stored for each node.

    @attention: only in use when ECG is a FirstExon ECG

    @type  gra: ExonCollectionGraph
    @param gra: the graph to convert; its nodes, ``_node_object``
                and ``get_orfs_of_graph()`` are read

    @rtype:  TranslationalStartSiteCollectionGraph
    @return: TranslationalStartSiteCollectionGraph instance to be placed in the CBG
    """
    newgra = TranslationalStartSiteCollectionGraph()
    newgra.ALIGNED_SITE_AA_OFFSET = 10
    newgra.MIN_PSSM_SCORE = -0.0

    for node in gra.get_nodes():
        # exon node is ( org, orf, ntpos), TSS node ( org, orf, aapos, ntpos )
        ntpos = node[2]
        # floor division keeps the aa coordinate an integer, also
        # when true division is in effect
        newnode = ( node[0], node[1], ntpos // 3, ntpos )
        tss = gra._node_object[node].acceptor
        newgra.add_node_and_object(newnode,tss)
        newgra._node_pssm[newnode] = tss.pssm_score

    # connect all cross combinations with fixed weight and entropies
    entropyQ = 1.0
    entropyS = 1.0
    for nodeA,nodeB in newgra.pairwisecrosscombinations_node():
        newgra.add_edge(nodeA,nodeB,wt=1.0)
        newgra._edge_binary_entropies[(nodeA,nodeB)] = (entropyQ,entropyS)
        newgra._edge_binary_entropies[(nodeB,nodeA)] = (entropyS,entropyQ)

    # Get tcode data for these start codon nodes
    # Assuming that this is indeed the start-codon,
    for (org,orfid,aaPos,dnaPos) in newgra.get_nodes():
        newnode = (org,orfid,aaPos,dnaPos)
        # NOTE(review): the first orf of the organism is taken, not the
        # orf identified by ``orfid``; confirm these are always the same
        theorf = gra.get_orfs_of_graph(organism=org)[0]
        # calculate the average TCODE scores for the windows
        ( tcode5p,tcode3p ) = theorf.tcode_entropy_of_pos(
                aaPos,
                window_left=newgra._TCODE_5P_WINDOWSIZE,
                window_right=newgra._TCODE_3P_WINDOWSIZE,
                )
        newgra._tcode5pscore[newnode] = tcode5p
        newgra._tcode3pscore[newnode] = tcode3p

    # return the TranslationalStartSiteCollectionGraph
    return newgra
# Example #3
# 0
def harvest_elegiable_tss_sites(self,max_aa_distance=ALIGNED_TSS_MAX_AA_DISTANCE,
    tss_min_pssm_score=TSS_MIN_PSSM_SCORE,
    skip_nonelegiable_sites=True):
    """
    Collect candidate translational start sites (TSS) into a collection graph

    The graph is filled in three passes:
      1. per organism, the first orf is scanned for PSSM TSS loci and each
         accepted locus becomes a node ( org, orfid, aaPos, ntPos )
      2. per entry in self.pacbps, an edge is added between each query and
         sbjct TSS node whose aligned distance is <= max_aa_distance;
         the edge weight is 1.0 / ( 1.0 + distance )
      3. per node, the 5p and 3p TCODE scores are stored on the graph

    @type  max_aa_distance: number
    @param max_aa_distance: largest accepted distance between two aligned
                            start sites; also stored on the graph as
                            ALIGNED_SITE_AA_OFFSET

    @type  tss_min_pssm_score: number
    @param tss_min_pssm_score: passed to the orf TSS scan and stored on
                               the graph as MIN_PSSM_SCORE

    @type  skip_nonelegiable_sites: Boolean
    @param skip_nonelegiable_sites: when True, sites outside the nt range
            given by self.minimal_eligable_tss_position() and
            self.maximal_eligable_tss_position() are not added

    @rtype:  TranslationalStartSiteCollectionGraph
    @return: the filled TSS collection graph
    """
    # update minimal pssm score to stg collection object
    stg = TranslationalStartSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = tss_min_pssm_score
    stg.ALIGNED_SITE_AA_OFFSET = max_aa_distance

    # First, process each individual organism.
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        # ready if there are no potential tss loci (no ATG sequence)
        if not theorf.has_start(): continue
        # scan for tss loci
        theorf.scan_orf_for_pssm_tss(min_pssm_score=tss_min_pssm_score)

        if skip_nonelegiable_sites:
            # get the considered TSS range; only the nt coordinate
            # of each ( aa, nt ) pair is needed for the filtering
            (_min_aa_pos, min_nt_pos) = self.minimal_eligable_tss_position(org)
            (_max_aa_pos, max_nt_pos) = self.maximal_eligable_tss_position(org)
        else:
            min_nt_pos = None
            max_nt_pos = None

        for tss in theorf._tss_sites:
            # check if we can ignore this site
            # NOTE(review): a falsy bound (None, but also 0) disables the
            # check; confirm that 0 is never a meaningful upper bound
            if min_nt_pos and tss.pos < min_nt_pos: continue
            if max_nt_pos and tss.pos > max_nt_pos: continue
            # an accepted site; add to TSS Collection Graph.
            # Floor division keeps the aa coordinate an integer, also
            # when true division is in effect
            tssNode = ( org, theorf.id, tss.pos // 3, tss.pos )
            stg.add_node_and_object(tssNode,tss)

    # Second, evaluate all cross combinations
    pacbps = self.pacbps
    if hasattr(pacbps,'iteritems'):
        pacbp_items = pacbps.iteritems()
    else:
        # no iteritems() available (e.g. a Python 3 dict)
        pacbp_items = pacbps.items()
    for pacbpkey, pacbporf in pacbp_items:
        # only the ( organism, orf ) pairs of the key are used here
        ( _pacbpinfo, (g1,o1), (g2,o2) ) = pacbpkey

        # only process this combination if both organisms have TSS sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all TSS in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are at most ``max_aa_distance`` apart from each other
        for tssQ in stg.get_organism_objects(g1):
            # the node that represents this site
            startQpos  = tssQ.pos // 3
            startQnode = ( g1, o1, startQpos, tssQ.pos )
            for tssS in stg.get_organism_objects(g2):
                # the node that represents this site
                startSpos  = tssS.pos // 3
                startSnode = ( g2, o2, startSpos, tssS.pos )

                # get distance between (aligned) start-codons
                dist = pacbporf.get_distance_aligned_protein_positions(
                        query=startQpos,sbjct=startSpos)

                # continue if distance between start sites is too big
                if dist > max_aa_distance: continue

                # calculate binary entropies from both positions;
                # the phase part of the returned pair is not needed
                startQpositionPos = pacbporf.dnaposition_query(
                        tssQ.pos,forced_return=True)[0]
                startSpositionPos = pacbporf.dnaposition_sbjct(
                        tssS.pos,forced_return=True)[0]
                entropyQ = pacbporf.alignment_entropy(startQpositionPos,method='left')
                entropyS = pacbporf.alignment_entropy(startSpositionPos,method='left')

                # calculate a weight from distance between startQpos and startSpos
                wt = 1.0 / ( 1.0 + float(dist) )

                # an edge that is already in the graph is only
                # overwritten when this alignment gives a higher weight
                if stg.has_edge( startQnode, startSnode ):
                    if not wt > stg.weights[( startQnode, startSnode )]:
                        continue
                    stg.set_edge_weight( startQnode, startSnode, wt=wt )
                else:
                    stg.add_edge( startQnode, startSnode, wt=wt )

                # and add binary entropy values (both edge directions)
                stg._edge_binary_entropies[(startQnode, startSnode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(startSnode, startQnode)] = (entropyS,entropyQ)

    # Get tcode data for these start codon nodes
    # Assuming that this is indeed the start-codon,
    # the stretch of ATG until max(OMSR) will be coding.
    # Take the length of this stretch (in nt) as right/3p window size
    omsr = self.overall_minimal_spanning_range()
    for (org,orfid,aaPos,dnaPos) in stg.get_nodes():
        node   = (org,orfid,aaPos,dnaPos)
        theorf = self.get_orfs_of_graph(organism=org)[0]
        right_window_size = ( max(omsr[(org,orfid)])+1 - aaPos )*3
        # confirm that window size is not <= 0; this is possible
        # once the Methionine/TSS is located downstream of the
        # OMSR max site. Fall back to the graph's default 3p window
        if right_window_size <= 0:
            right_window_size = stg._TCODE_3P_WINDOWSIZE
        # calculate the average TCODE scores for the windows
        ( tcode5p,tcode3p ) = theorf.tcode_entropy_of_pos(
                aaPos,
                window_left=stg._TCODE_5P_WINDOWSIZE,
                window_right=right_window_size,
                )
        stg._tcode5pscore[node] = tcode5p
        stg._tcode3pscore[node] = tcode3p

    # return filled tsscollection graph
    return stg