Пример #1
0
def scan_pssm_splice_site(seq,splicetype="donor",
    override_pattern_offset=(),min_pssm_score=None,
    allow_non_canonical=False,non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,relativescore=False,):
    """
    Find splice sites by a PSSM on input sequence

    A window with the length of the PSSM is slid over `seq` (compared in
    upper case). Each window that carries an accepted splice site at the
    position given by the pattern offset is scored, filtered on its minimal
    score and stored as a SpliceDonor or SpliceAcceptor object.

    @type  seq:  string
    @param seq: DNA sequence to scan; a sequence shorter than the PSSM
                yields an empty list

    @type  splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @type  override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers (number of PSSM
                positions left and right of the splice site); replaces the
                default offset of the chosen splicetype. Use cautiously!!

    @type  min_pssm_score:   float
    @param min_pssm_score: minimal score for canonical sites; None (default)
                disables this filter

    @type  allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; when True, GC donors are
                scored with the non-canonical donor PSSM. No effect on
                acceptors.

    @type  non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score: minimal score for non-canonical
                sites; None disables this filter

    @type  ignore_unambiguity:  boolean
    @param ignore_unambiguity: passed on unchanged to the Pssm scoring class

    @type  relativescore:  boolean
    @param relativescore: passed on unchanged to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; acceptors ordered on
             increasing offset, donors on decreasing offset

    @raise InproperlyAppliedArgument: splicetype is not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        pssm_ic         = IC_ACCEPTOR
        pattern_offset  = IC_ACCEPTOR_PATTERN_OFFSET
        canonical       = "AG"
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        pssm_ic         = IC_DONOR
        pattern_offset  = IC_DONOR_PATTERN_OFFSET
        canonical       = "GT"
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call-style raise is valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for the canonical sites
    canssPssm = Pssm(ic=pssm_ic,ignore_unambiguity=ignore_unambiguity,relativescore=relativescore)

    # Non-canonical sites are only recognized for donors, so the GC donor
    # PSSM is only parsed when it can actually be used.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and splicetype == 'donor':
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical  = ["GC"]
        noncanssPssm  = Pssm(ic=IC_NCGC_DONOR,ignore_unambiguity=ignore_unambiguity,relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(pssm_ic)
    # Use an explicit end coordinate for the splice site slice: a negative
    # slice end (seqpart[a:-b]) yields an empty string for a right offset 0.
    site_start = pattern_offset[0]
    site_end   = pssmlength - pattern_offset[1]

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1 ):
        # get sequence slice of pattern and actual splice site
        seqpart    = seq[offset:offset+pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            score = canssPssm.score(seqpart)
            # a threshold of 0.0 is a valid threshold; only None disables it
            if min_pssm_score is not None and score < min_pssm_score:
                continue
        elif splicesite in noncanonical:
            # non-canonical donor; list is empty unless requested for donors
            score = noncanssPssm.score(seqpart)
            if non_canonical_min_pssm_score is not None and \
            score < non_canonical_min_pssm_score:
                continue
        else:
            # neither a canonical nor an accepted non-canonical site
            continue

        if splicetype == 'acceptor':
            site = SpliceAcceptor(offset,seqpart,acceptor=splicesite,pssm_score=score)
        else:
            site = SpliceDonor(offset,seqpart,donor=splicesite,pssm_score=score)
        sites.append(site)

    # donors are returned in reversed order
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
Пример #2
0
import sys
from pssm import parse_ic_data, parse_ic_file

# Load the IC PSSM: from the file given as the only command line argument,
# otherwise from stdin (lines starting with '#' are skipped).
if len(sys.argv) == 2:
    IC = parse_ic_file(sys.argv[1])
else:
    ic_data = []
    for line in sys.stdin.readlines():
        if line.startswith('#'): continue
        ic_data.append(line.strip())
    IC = parse_ic_data("\n".join(ic_data))


# Build, for every PSSM column, a pool of exactly num_seqs bases in which
# each base occurs proportionally to its frequency in that column.
buffer = []
num_seqs = 1000
for col in range(len(IC)):
    buffer.append([])
    vdict = IC[col]
    # .items() works on both Python 2 and Python 3 (iteritems is 2-only)
    for base,value in vdict.items():
        # column value -> frequency as 2**(value-2)
        # NOTE(review): presumably value is log2(freq/0.25) -- confirm
        freq = pow(2,value-2.0)
        cnt = int(round(freq*num_seqs))
        buffer[-1].extend( [ base ]*cnt )
    # rounding can leave the pool too short: pad with "n" ...
    while len(buffer[-1]) < num_seqs:
        buffer[-1].append("n")
    # ... or too long: trim, so all columns are equally long and popping one
    # base per column can never exhaust one column before the others
    del buffer[-1][num_seqs:]

        
# Assemble sequences by popping one base from every column pool, for as long
# as the pool of the first column is not empty; each sequence is stored in
# `seqs` under the key `cnt`.
# NOTE(review): `cnt` is not incremented in the visible part of this loop, so
# every sequence would overwrite seqs[1] -- confirm that `cnt += 1` follows.
seqs = {}
cnt=1
while buffer[0]:
    seq = "".join( [ buffer[col].pop() for col in range(len(IC)) ] )
    seqs[cnt] = seq
Пример #3
0
    IncompatibleSpliceSitePhases,
    )
from pssm import parse_ic_file, Pssm, pssmscore


# Import Global variables
from settings.splicesites import (
    IC_DONOR_PATTERN_OFFSET,
    IC_DONOR_DATA_FILE,
    IC_DONOR_NCGC_DATA_FILE,
    IC_ACCEPTOR_PATTERN_OFFSET,
    IC_ACCEPTOR_DATA_FILE,
    )

# Parse the IC PSSM files once, at module level: the canonical acceptor and
# donor matrices, plus the matrix for non-canonical (GC) donors.
IC_ACCEPTOR    = parse_ic_file(IC_ACCEPTOR_DATA_FILE)
IC_DONOR       = parse_ic_file(IC_DONOR_DATA_FILE)
IC_NC_GC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)

class SpliceSiteBase(BasicGFF):
    """ """
    def __init__(self,start,phase=None,strand='+',pattern=None,
        pattern_offset=(0,0),pssm_score=None,gff={}):
        """
        Initialization function of Basal SpliceSite logic
        Recommended is to use only one of the inheriting classes

        @type  start: number
    	@param start: start coord of site (e.g GT) or pattern (e.g. tgtGTcgat)

        @type  phase: number
Пример #4
0
import sys
from pssm import parse_ic_data, parse_ic_file

# Load the IC PSSM: from the file given as the only command line argument,
# otherwise from stdin (lines starting with '#' are skipped).
if len(sys.argv) == 2:
    IC = parse_ic_file(sys.argv[1])
else:
    ic_data = []
    for line in sys.stdin.readlines():
        if line.startswith('#'):
            continue
        ic_data.append(line.strip())
    IC = parse_ic_data("\n".join(ic_data))

# Build, for every PSSM column, a pool of exactly num_seqs bases in which
# each base occurs proportionally to its frequency in that column.
buffer = []
num_seqs = 1000
for col in range(len(IC)):
    buffer.append([])
    vdict = IC[col]
    # .items() works on both Python 2 and Python 3 (iteritems is 2-only)
    for base, value in vdict.items():
        # column value -> frequency as 2**(value-2)
        # NOTE(review): presumably value is log2(freq/0.25) -- confirm
        freq = pow(2, value - 2.0)
        cnt = int(round(freq * num_seqs))
        buffer[-1].extend([base] * cnt)
    # rounding can leave the pool too short: pad with "n" ...
    while len(buffer[-1]) < num_seqs:
        buffer[-1].append("n")
    # ... or too long: trim, so all columns are equally long and the loop
    # below can never pop from an already exhausted column
    del buffer[-1][num_seqs:]

# Assemble the sequences: pop one base per column until the pools are empty.
# `seqs` maps a 1-based sequence number to the sequence string.
seqs = {}
cnt = 1
# `buffer and ...` guards against an empty PSSM (no columns at all)
while buffer and buffer[0]:
    seq = "".join([buffer[col].pop() for col in range(len(IC))])
    seqs[cnt] = seq
    cnt += 1
Пример #5
0
def scan_pssm_splice_site(
    seq,
    splicetype="donor",
    override_pattern_offset=(),
    min_pssm_score=None,
    allow_non_canonical=False,
    non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,
    relativescore=False,
):
    """
    Find splice sites by a PSSM on input sequence

    A window with the length of the PSSM is slid over `seq` (compared in
    upper case). Each window that carries an accepted splice site at the
    position given by the pattern offset is scored, filtered on its minimal
    score and stored as a SpliceDonor or SpliceAcceptor object.

    @type  seq:  string
    @param seq: DNA sequence to scan; a sequence shorter than the PSSM
                yields an empty list

    @type  splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @type  override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers (number of PSSM
                positions left and right of the splice site); replaces the
                default offset of the chosen splicetype. Use cautiously!!

    @type  min_pssm_score:   float
    @param min_pssm_score: minimal score for canonical sites; None (default)
                disables this filter

    @type  allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; when True, GC donors are
                scored with the non-canonical donor PSSM. No effect on
                acceptors.

    @type  non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score: minimal score for non-canonical
                sites; None disables this filter

    @type  ignore_unambiguity:  boolean
    @param ignore_unambiguity: passed on unchanged to the Pssm scoring class

    @type  relativescore:  boolean
    @param relativescore: passed on unchanged to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; acceptors ordered on
             increasing offset, donors on decreasing offset

    @raise InproperlyAppliedArgument: splicetype is not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        pssm_ic = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        pssm_ic = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call-style raise is valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for the canonical sites
    canssPssm = Pssm(ic=pssm_ic,
                     ignore_unambiguity=ignore_unambiguity,
                     relativescore=relativescore)

    # Non-canonical sites are only recognized for donors, so the GC donor
    # PSSM is only parsed when it can actually be used.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and splicetype == 'donor':
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                            ignore_unambiguity=ignore_unambiguity,
                            relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(pssm_ic)
    # Use an explicit end coordinate for the splice site slice: a negative
    # slice end (seqpart[a:-b]) yields an empty string for a right offset 0.
    site_start = pattern_offset[0]
    site_end = pssmlength - pattern_offset[1]

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            score = canssPssm.score(seqpart)
            # a threshold of 0.0 is a valid threshold; only None disables it
            if min_pssm_score is not None and score < min_pssm_score:
                continue
        elif splicesite in noncanonical:
            # non-canonical donor; list is empty unless requested for donors
            score = noncanssPssm.score(seqpart)
            if (non_canonical_min_pssm_score is not None
                    and score < non_canonical_min_pssm_score):
                continue
        else:
            # neither a canonical nor an accepted non-canonical site
            continue

        if splicetype == 'acceptor':
            site = SpliceAcceptor(offset,
                                  seqpart,
                                  acceptor=splicesite,
                                  pssm_score=score)
        else:
            site = SpliceDonor(offset,
                               seqpart,
                               donor=splicesite,
                               pssm_score=score)
        sites.append(site)

    # donors are returned in reversed order
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
Пример #6
0
    UnexpectedSpliceSitePhase,
    IncompatibleSpliceSitePhases,
)
from pssm import parse_ic_file, Pssm, pssmscore

# Import Global variables
from settings.splicesites import (
    IC_DONOR_PATTERN_OFFSET,
    IC_DONOR_DATA_FILE,
    IC_DONOR_NCGC_DATA_FILE,
    IC_ACCEPTOR_PATTERN_OFFSET,
    IC_ACCEPTOR_DATA_FILE,
)

# Parse the IC PSSM files once, at module level: the canonical acceptor and
# donor matrices, plus the matrix for non-canonical (GC) donors.
IC_ACCEPTOR = parse_ic_file(IC_ACCEPTOR_DATA_FILE)
IC_DONOR = parse_ic_file(IC_DONOR_DATA_FILE)
IC_NC_GC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)


class SpliceSiteBase(BasicGFF):
    """ """
    def __init__(self,
                 start,
                 phase=None,
                 strand='+',
                 pattern=None,
                 pattern_offset=(0, 0),
                 pssm_score=None,
                 gff={}):
        """
Пример #7
0
# Python Imports
from re import finditer, compile
from copy import deepcopy

# Import Global variables
from settings.translationalstartsites import (
    TSS_MIN_PSSM_SCORE,
    TSS_ALLOW_NON_CANONICAL,
    TSS_NON_CANONICAL_MIN_PSSM_SCORE,
    IC_TSS_DATA_FILE,
    IC_TSS_PATTERN_OFFSET,
    )

# Parse the IC PSSM file of the TSS (translational start site) once, at
# module level.
IC_TSS = parse_ic_file(IC_TSS_DATA_FILE)



class StartCodon(BasicGFF):
    def __init__(self, pos, gff={}):
        """
        Initialize a StartCodon on coordinate `pos`

        @type  pos: number
        @param pos: start coordinate of the start codon

        @type  gff: dictionary
        @param gff: values to update the object's `_gff` attribute with;
                    only read here, never modified
        """
        BasicGFF.__init__(self)
        self._gff.update(gff)
        # the codon spans 3 positions, counted from `pos`
        self.pos = pos
        self.start = self.pos
        self.end = self.start + 3
        # fixed defaults: phase 0 and a dummy PSSM score
        self.phase = 0
        self.pssm_score = 1.0
    # end of function __init__