Пример #1
0
 def expand(self):
     """Build ``self.mMapOld2New`` unless it is already set.

     When ``self.mMapOld2New`` is falsy, a fresh alignment vector is
     stored in it and filled from the emissions-format description held
     in ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli``.  When it is
     truthy, nothing happens.
     """
     if self.mMapOld2New:
         # already expanded
         return

     self.mMapOld2New = alignlib_lite.py_makeAlignmentVector()

     emissions = alignlib_lite.py_AlignmentFormatEmissions(
         self.mOldFrom, self.mOldAli,
         self.mNewFrom, self.mNewAli)
     emissions.copy(self.mMapOld2New)
Пример #2
0
    def expand(self):
        """Build ``self.mMapOld2New`` unless it is already set.

        When ``self.mMapOld2New`` is falsy, a fresh alignment vector is
        stored in it and filled from the emissions-format description held
        in ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli``.  When it
        is truthy, nothing happens.
        """
        if self.mMapOld2New:
            # already expanded
            return

        self.mMapOld2New = alignlib_lite.py_makeAlignmentVector()

        emissions = alignlib_lite.py_AlignmentFormatEmissions(
            self.mOldFrom, self.mOldAli,
            self.mNewFrom, self.mNewAli)
        emissions.copy(self.mMapOld2New)
Пример #3
0
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Align a peptide sequence against its coding sequence.

    Blanks are stripped from the peptide; blanks, ``.`` and ``-`` are
    stripped from the cds.  The upper-cased peptide is converted with
    ``Genomics.Protein2Wobble`` and ``AlignCodonBased`` is asked to fill a
    fresh alignment vector from the wobble, cds and peptide sequences.

    The original documentation states that the returned alignment is in
    nucleotides.
    NOTE(review): no ``return`` statement is visible in this block, so as
    written the function returns ``None`` -- confirm whether the tail of
    the function (returning the map) is missing.

    :param peptide_sequence: peptide sequence as a string.
    :param cds_sequence: coding sequence as a string.
    :param options: object whose ``loglevel`` and ``stdlog`` attributes
        are used for logging; it is also handed to ``AlignCodonBased``.
    :raises ValueError: when ``AlignCodonBased`` raises ``ValueError``;
        the message is prefixed with ``mapping error for sequence``.
    """

    # remove whitespaces from protein sequence
    p = re.sub(" ", "", peptide_sequence)

    # remove gaps and whitespaces from cds
    c = re.sub("[ .-]", "", cds_sequence)

    w = Genomics.Protein2Wobble(p.upper())

    if options.loglevel >= 6:
        options.stdlog.write("# peptide original (%5i): %s\n" % (len(p), p))
        options.stdlog.write("# cds original     (%5i): %s\n" % (len(c), c))
        options.stdlog.write("# wobble sequence  (%5i): %s\n" % (len(w), w))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(w)
    seq_cds = alignlib_lite.py_makeSequence(c.upper())
    seq_peptide = alignlib_lite.py_makeSequence(p)

    map_p2c = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble,
                        seq_cds,
                        seq_peptide,
                        map_p2c,
                        options=options)
    except ValueError as msg:
        # "except ... as" is valid from Python 2.6 onwards and in Python 3
        raise ValueError("mapping error for sequence: %s" % (msg))
Пример #4
0
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Align a peptide sequence against its coding sequence.

    Blanks are stripped from the peptide; blanks, ``.`` and ``-`` are
    stripped from the cds.  The upper-cased peptide is converted with
    ``Genomics.Protein2Wobble`` and ``AlignCodonBased`` is asked to fill a
    fresh alignment vector from the wobble, cds and peptide sequences.

    The original documentation states that the returned alignment is in
    nucleotides.
    NOTE(review): no ``return`` statement is visible in this block, so as
    written the function returns ``None`` -- confirm whether the tail of
    the function (returning the map) is missing.

    :param peptide_sequence: peptide sequence as a string.
    :param cds_sequence: coding sequence as a string.
    :param options: object whose ``loglevel`` and ``stdlog`` attributes
        are used for logging; it is also handed to ``AlignCodonBased``.
    :raises ValueError: when ``AlignCodonBased`` raises ``ValueError``;
        the message is prefixed with ``mapping error for sequence``.
    """

    # remove whitespaces from protein sequence
    p = re.sub(" ", "", peptide_sequence)

    # remove gaps and whitespaces from cds
    c = re.sub("[ .-]", "", cds_sequence)

    w = Genomics.Protein2Wobble(p.upper())

    if options.loglevel >= 6:
        options.stdlog.write("# peptide original (%5i): %s\n" % (len(p), p))
        options.stdlog.write("# cds original     (%5i): %s\n" % (len(c), c))
        options.stdlog.write("# wobble sequence  (%5i): %s\n" % (len(w), w))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(w)
    # str.upper() replaces string.upper(), which no longer exists in Python 3
    seq_cds = alignlib_lite.py_makeSequence(c.upper())
    seq_peptide = alignlib_lite.py_makeSequence(p)

    map_p2c = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c,
                        options=options)
    except ValueError as msg:
        # "except ... as" is valid from Python 2.6 onwards and in Python 3
        raise ValueError("mapping error for sequence: %s" % (msg))
Пример #5
0
def AlignPair(pair, anchor=0):
    """Align the two intron sequences of *pair* and store the result on it.

    The aligner is selected by the module-level ``param_method``
    (``dialigned``, ``dialignedlgs`` or ``dbaligned``); progress messages
    are controlled by the module-level ``param_loglevel``.  Any other
    value of ``param_method`` runs no aligner, which leaves the map empty.

    :param pair: object providing ``mToken1/2``, ``mIntronId1/2`` and
        ``mAlignedSequence1/2``.  On success its ``mFrom1/2``,
        ``mAlignedSequence1/2``, ``mTo1/2``, ``mMethod``, ``mNumGaps``,
        ``mLength`` and ``mAligned`` attributes are overwritten.
    :param anchor: number of ``A`` characters added to both ends of both
        sequences before aligning; the anchored regions are removed from
        the alignment again afterwards.
    :returns: ``False`` if the resulting alignment is empty, else ``True``.
    :raises NotImplementedError: if ``param_method`` is ``clusaligned``.
    """

    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1, pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    s1 = "A" * anchor + pair.mAlignedSequence1 + "A" * anchor
    s2 = "A" * anchor + pair.mAlignedSequence2 + "A" * anchor

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the clustal call that used to follow this raise was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # cut off the trailing anchor first, then the leading one, for
        # rows and columns alike, and shift the alignment back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1,
                                             pair.mAlignedSequence2))

    return True
Пример #6
0
def AlignPair(pair, anchor=0):
    """Align the two intron sequences of *pair* and store the result on it.

    The aligner is selected by the module-level ``param_method``
    (``dialigned``, ``dialignedlgs`` or ``dbaligned``); progress messages
    are controlled by the module-level ``param_loglevel``.  Any other
    value of ``param_method`` runs no aligner, which leaves the map empty.

    :param pair: object providing ``mToken1/2``, ``mIntronId1/2`` and
        ``mAlignedSequence1/2``.  On success its ``mFrom1/2``,
        ``mAlignedSequence1/2``, ``mTo1/2``, ``mMethod``, ``mNumGaps``,
        ``mLength`` and ``mAligned`` attributes are overwritten.
    :param anchor: number of ``A`` characters added to both ends of both
        sequences before aligning; the anchored regions are removed from
        the alignment again afterwards.
    :returns: ``False`` if the resulting alignment is empty, else ``True``.
    :raises NotImplementedError: if ``param_method`` is ``clusaligned``.
    """

    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1,
            pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1),
            len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    s1 = "A" * anchor + pair.mAlignedSequence1 + "A" * anchor
    s2 = "A" * anchor + pair.mAlignedSequence2 + "A" * anchor

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the clustal call that used to follow this raise was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # cut off the trailing anchor first, then the leading one, for
        # rows and columns alike, and shift the alignment back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1, map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1, map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1,
                                             pair.mAlignedSequence2))

    return True
Пример #7
0
 def GetMap(self):
     """Return the map between the two segments, or ``None``.

     ``None`` is returned when either ``mAlignmentFrom1`` or
     ``mAlignmentFrom2`` is falsy.  Otherwise a new alignment vector is
     filled from the emissions-format fields of both segments.
     """
     if not (self.mAlignmentFrom1 and self.mAlignmentFrom2):
         return None

     result = alignlib_lite.py_makeAlignmentVector()
     emissions = alignlib_lite.py_AlignmentFormatEmissions(
         self.mAlignmentFrom1, self.mAlignment1,
         self.mAlignmentFrom2, self.mAlignment2)
     emissions.copy(result)
     return result
Пример #8
0
    def fillFromTable(self, table_row):
        """Populate this entry from a table row.

        The first 25 fields of *table_row* are assigned, in order, to the
        prediction attributes (``mPredictionId`` ... ``mAlignmentString``).
        If a 26th field is present it is stored in ``mNAssembled``; any
        further fields are ignored.

        When ``self.mExpand`` is true, ``mMapPeptide2Translation`` is
        rebuilt (filled only if both ``mQueryAli`` and ``mSbjctAli`` are
        non-empty) and ``mMapPeptide2Genome`` is rebuilt from
        ``mAlignmentString``.

        :param table_row: sequence with at least 25 fields.
        :raises ValueError: if *table_row* has fewer than 25 fields.
        """
        nfields = len(table_row)
        if nfields < 25:
            # the message previously referenced an undefined name and
            # therefore failed with NameError instead of ValueError
            raise ValueError("unknown format: %i fields" % nfields)

        (self.mPredictionId,
         self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
         self.mRank, self.score,
         self.mQueryFrom, self.mQueryTo, self.mQueryAli,
         self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
         self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons,
         self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString) = table_row[:25]

        if nfields >= 26:
            self.mNAssembled = table_row[25]

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib_lite.py_AlignmentFormatEmissions(
                    self.mQueryFrom, self.mQueryAli,
                    self.mSbjctFrom, self.mSbjctAli).copy(
                        self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Пример #9
0
def main(argv=None):
    """Script entry point.

    Reads fasta records from stdin and writes, for every record, a
    tab-separated line holding the record title, the literal ``ref`` and
    the blocks-format rendering of an alignment between the blank-free
    sequence and an equally long run of ``X``.

    Command line options are taken from sys.argv unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    ninput = 0
    noutput = 0
    nskipped = 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n")

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()

        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(map_sequence2mali)

        title = record.title
        blocks = str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali))
        options.stdout.write("\t".join((title, "ref", blocks)) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        summary = "# ninput=%i, noutput=%i, nskipped=%i.\n" % (
            ninput, noutput, nskipped)
        options.stdlog.write(summary)

    E.Stop()
Пример #10
0
def Alignment2CDNA(alignment,
                   query_from=0,
                   sbjct_from=0,
                   genome=None,
                   remove_frameshifts=0):
    """Build a cDNA sequence from a genomic fragment and return the
    alignment of the query to it.

    :param alignment: iterable of ``(state, l_query, l_sbjct)`` triples.
        ``l_query`` is multiplied by three to count in nucleotides.
    :param query_from: start on the query; multiplied by three as well.
    :param sbjct_from: start position on the genomic fragment.
    :param genome: sliceable genomic sequence; when falsy no sequence
        fragments are collected.
    :param remove_frameshifts: when true, segments in state ``F`` are
        left out of the cDNA.
    :returns: tuple of the query-to-cDNA alignment and the cDNA string
        (empty when no fragments were collected).

    Segments are kept for state ``M``, for ``S`` (where the query length
    is set to the sbjct length), for ``F`` unless *remove_frameshifts* is
    set, and for ``G`` with a positive sbjct length.  All other segments
    only advance the query and sbjct positions.
    """

    fragments = []
    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # count in nucleotides for query
    query_pos = query_from * 3
    sbjct_pos = sbjct_from
    # position in cDNA
    cdna_pos = 0
    for state, l_query, l_sbjct in alignment:

        # count as nucleotides
        l_query *= 3

        keep = False

        if state == "M":
            keep = True
        elif state == "S":
            l_query = l_sbjct
            keep = True
        elif state == "F" and not remove_frameshifts:
            keep = True
        elif state == "G":
            keep = l_sbjct > 0

        if keep:
            if genome:
                fragments.append(genome[sbjct_pos:sbjct_pos + l_sbjct])

            if l_query > 0 and l_sbjct > 0:
                alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                       query_pos,
                                                       query_pos + l_query,
                                                       cdna_pos - query_pos)
            cdna_pos += l_sbjct

        query_pos += l_query
        sbjct_pos += l_sbjct

    # join is a method of the separator string, not of the list
    return map_query2sbjct, "".join(fragments)
Пример #11
0
def Alignment2CDNA(alignment,
                   query_from=0,
                   sbjct_from=0,
                   genome=None,
                   remove_frameshifts=0):
    """Build a cDNA sequence from a genomic fragment and return the
    alignment of the query to it.

    :param alignment: iterable of ``(state, l_query, l_sbjct)`` triples.
        ``l_query`` is multiplied by three to count in nucleotides.
    :param query_from: start on the query; multiplied by three as well.
    :param sbjct_from: start position on the genomic fragment.
    :param genome: sliceable genomic sequence; when falsy no sequence
        fragments are collected.
    :param remove_frameshifts: when true, segments in state ``F`` are
        left out of the cDNA.
    :returns: tuple of the query-to-cDNA alignment and the cDNA string
        (empty when no fragments were collected).

    Segments are kept for state ``M``, for ``S`` (where the query length
    is set to the sbjct length), for ``F`` unless *remove_frameshifts* is
    set, and for ``G`` with a positive sbjct length.  All other segments
    only advance the query and sbjct positions.
    """

    fragments = []
    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # count in nucleotides for query
    query_pos = query_from * 3
    sbjct_pos = sbjct_from
    # position in cDNA
    cdna_pos = 0
    for state, l_query, l_sbjct in alignment:

        # count as nucleotides
        l_query *= 3

        keep = False

        if state == "M":
            keep = True
        elif state == "S":
            l_query = l_sbjct
            keep = True
        elif state == "F" and not remove_frameshifts:
            keep = True
        elif state == "G":
            keep = l_sbjct > 0

        if keep:
            if genome:
                fragments.append(genome[sbjct_pos:sbjct_pos + l_sbjct])

            if l_query > 0 and l_sbjct > 0:
                alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                       query_pos,
                                                       query_pos + l_query,
                                                       cdna_pos - query_pos)
            cdna_pos += l_sbjct

        query_pos += l_query
        sbjct_pos += l_sbjct

    # join is a method of the separator string, not of the list
    return map_query2sbjct, "".join(fragments)
Пример #12
0
def main(argv=None):
    """Script entry point.

    Reads fasta records from stdin and writes, for every record, a
    tab-separated line holding the record title, the literal ``ref`` and
    the blocks-format rendering of an alignment between the blank-free
    sequence and an equally long run of ``X``.

    Command line options are taken from sys.argv unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    ninput = 0
    noutput = 0
    nskipped = 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n")

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()

        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(map_sequence2mali)

        title = record.title
        blocks = str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali))
        options.stdout.write("\t".join((title, "ref", blocks)) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        summary = "# ninput=%i, noutput=%i, nskipped=%i.\n" % (
            ninput, noutput, nskipped)
        options.stdlog.write(summary)

    E.Stop()
Пример #13
0
    def getCopy(self):
        """Return a new ``Prediction`` carrying this entry's field values.

        All scalar fields, including ``mNAssembled``, are copied.  When
        ``self.mExpand`` is true the copy gets its own
        ``mMapPeptide2Translation`` (filled via ``py_copyAlignment``) and
        a ``mMapPeptide2Genome`` rebuilt from the alignment string;
        otherwise both maps of the copy are ``None``.

        The entry being copied is not modified.
        """

        new_entry = Prediction()

        new_entry.mExpand = self.mExpand

        for name in ("mPredictionId",
                     "mQueryToken", "mQueryFrom", "mQueryTo",
                     "mSbjctToken", "mSbjctStrand",
                     "mSbjctFrom", "mSbjctTo",
                     "mRank", "score",
                     "mQueryLength", "mQueryCoverage",
                     "mNGaps", "mNFrameShifts", "mNIntrons",
                     "mNSplits", "mNStopCodons",
                     "mPercentIdentity", "mPercentSimilarity",
                     "mTranslation",
                     "mSbjctGenomeFrom", "mSbjctGenomeTo",
                     "mAlignmentString",
                     "mQueryAli", "mSbjctAli"):
            setattr(new_entry, name, getattr(self, name))

        # previously dropped by the copy; fall back to 0 for entries on
        # which the attribute was never set
        new_entry.mNAssembled = getattr(self, "mNAssembled", 0)

        if self.mExpand:
            new_entry.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
            alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                           self.mMapPeptide2Translation)
            new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
                new_entry.mAlignmentString)
        else:
            # only the copy is touched; the former chained assignment also
            # reset the maps of the entry being copied
            new_entry.mMapPeptide2Translation = None
            new_entry.mMapPeptide2Genome = None

        return new_entry
Пример #14
0
    def fillFromTable(self, table_row):
        """Populate this entry from a table row.

        The first 25 fields of *table_row* are assigned, in order, to the
        prediction attributes (``mPredictionId`` ... ``mAlignmentString``).
        If a 26th field is present it is stored in ``mNAssembled``; any
        further fields are ignored.

        When ``self.mExpand`` is true, ``mMapPeptide2Translation`` is
        rebuilt (filled only if both ``mQueryAli`` and ``mSbjctAli`` are
        non-empty) and ``mMapPeptide2Genome`` is rebuilt from
        ``mAlignmentString``.

        :param table_row: sequence with at least 25 fields.
        :raises ValueError: if *table_row* has fewer than 25 fields.
        """
        nfields = len(table_row)
        if nfields < 25:
            # the message previously referenced an undefined name and
            # therefore failed with NameError instead of ValueError
            raise ValueError("unknown format: %i fields" % nfields)

        (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
         self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
         self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
         self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits,
         self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString) = table_row[:25]

        if nfields >= 26:
            self.mNAssembled = table_row[25]

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib_lite.py_AlignmentFormatEmissions(
                    self.mQueryFrom, self.mQueryAli, self.mSbjctFrom,
                    self.mSbjctAli).copy(self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Пример #15
0
    def getCopy(self):
        """Return a new ``Prediction`` carrying this entry's field values.

        All scalar fields, including ``mNAssembled``, are copied.  When
        ``self.mExpand`` is true the copy gets its own
        ``mMapPeptide2Translation`` (filled via ``py_copyAlignment``) and
        a ``mMapPeptide2Genome`` rebuilt from the alignment string;
        otherwise both maps of the copy are ``None``.

        The entry being copied is not modified.
        """

        new_entry = Prediction()

        new_entry.mExpand = self.mExpand

        for name in ("mPredictionId",
                     "mQueryToken", "mQueryFrom", "mQueryTo",
                     "mSbjctToken", "mSbjctStrand",
                     "mSbjctFrom", "mSbjctTo",
                     "mRank", "score",
                     "mQueryLength", "mQueryCoverage",
                     "mNGaps", "mNFrameShifts", "mNIntrons",
                     "mNSplits", "mNStopCodons",
                     "mPercentIdentity", "mPercentSimilarity",
                     "mTranslation",
                     "mSbjctGenomeFrom", "mSbjctGenomeTo",
                     "mAlignmentString",
                     "mQueryAli", "mSbjctAli"):
            setattr(new_entry, name, getattr(self, name))

        # previously dropped by the copy; fall back to 0 for entries on
        # which the attribute was never set
        new_entry.mNAssembled = getattr(self, "mNAssembled", 0)

        if self.mExpand:
            new_entry.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
            alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                           self.mMapPeptide2Translation)
            new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
                new_entry.mAlignmentString)
        else:
            # only the copy is touched; the former chained assignment also
            # reset the maps of the entry being copied
            new_entry.mMapPeptide2Translation = None
            new_entry.mMapPeptide2Genome = None

        return new_entry
Пример #16
0
    def __init__(self, expand=1):
        """Create an empty prediction entry.

        Numeric fields start at ``0`` and text fields at ``""``.  With a
        true *expand* the entry gets a fresh alignment vector in
        ``mMapPeptide2Translation`` and an empty list in
        ``mMapPeptide2Genome``; otherwise both are ``None``.
        """

        self.mExpand = expand

        # (attribute, initial value), applied in exactly this order
        initial_values = (
            ("mPredictionId", 0),
            ("mQueryToken", 0),
            ("mQueryFrom", 0),
            ("mQueryTo", 0),
            ("mSbjctToken", 0),
            ("mSbjctStrand", 0),
            ("mSbjctFrom", 0),
            ("mSbjctTo", 0),
            ("mRank", 0),
            ("score", 0),
            ("mQueryLength", 0),
            ("mQueryCoverage", 0),
            ("mNGaps", 0),
            ("mNFrameShifts", 0),
            ("mNIntrons", 0),
            ("mNSplits", 0),
            ("mNStopCodons", 0),
            ("mPercentIdentity", 0),
            ("mPercentSimilarity", 0),
            ("mTranslation", ""),
            ("mSbjctGenomeFrom", 0),
            ("mSbjctGenomeTo", 0),
            ("mAlignmentString", ""),
            ("mQueryAli", ""),
            ("mSbjctAli", ""),
        )
        for attribute, value in initial_values:
            setattr(self, attribute, value)

        if not self.mExpand:
            self.mMapPeptide2Translation = None
            self.mMapPeptide2Genome = None
        else:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
            self.mMapPeptide2Genome = []

        self.mNAssembled = 0
Пример #17
0
    def __init__(self, expand=1):
        """Create an empty prediction entry.

        Numeric fields start at ``0`` and text fields at ``""``.  With a
        true *expand* the entry gets a fresh alignment vector in
        ``mMapPeptide2Translation`` and an empty list in
        ``mMapPeptide2Genome``; otherwise both are ``None``.
        """

        self.mExpand = expand

        # (attribute, initial value), applied in exactly this order
        initial_values = (
            ("mPredictionId", 0),
            ("mQueryToken", 0),
            ("mQueryFrom", 0),
            ("mQueryTo", 0),
            ("mSbjctToken", 0),
            ("mSbjctStrand", 0),
            ("mSbjctFrom", 0),
            ("mSbjctTo", 0),
            ("mRank", 0),
            ("score", 0),
            ("mQueryLength", 0),
            ("mQueryCoverage", 0),
            ("mNGaps", 0),
            ("mNFrameShifts", 0),
            ("mNIntrons", 0),
            ("mNSplits", 0),
            ("mNStopCodons", 0),
            ("mPercentIdentity", 0),
            ("mPercentSimilarity", 0),
            ("mTranslation", ""),
            ("mSbjctGenomeFrom", 0),
            ("mSbjctGenomeTo", 0),
            ("mAlignmentString", ""),
            ("mQueryAli", ""),
            ("mSbjctAli", ""),
        )
        for attribute, value in initial_values:
            setattr(self, attribute, value)

        if not self.mExpand:
            self.mMapPeptide2Translation = None
            self.mMapPeptide2Genome = None
        else:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
            self.mMapPeptide2Genome = []

        self.mNAssembled = 0
Пример #18
0
            continue

        # An "unaligned" record is remembered in unaligned_pair so that
        # later records can be compared against it; its intron types are
        # derived from its two sequences.
        if pair.mMethod == "unaligned":
            unaligned_pair = pair
            pair.mType1 = GetIntronType(unaligned_pair.mAlignedSequence1)
            pair.mType2 = GetIntronType(unaligned_pair.mAlignedSequence2)
            do_print = param_echo_unaligned
        else:
            do_print = 1
            if param_is_compressed:
                # Only records matching the remembered unaligned pair in
                # both tokens and in mIntronId1 are processed here.
                # NOTE(review): mIntronId2 is not compared -- confirm this
                # is intended.
                if unaligned_pair and \
                        unaligned_pair.mToken1 == pair.mToken1 and \
                        unaligned_pair.mToken2 == pair.mToken2 and \
                        unaligned_pair.mIntronId1 == pair.mIntronId1:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()
                    # NOTE(review): AlignmentFormatEmissions is used without
                    # the alignlib_lite.py_ prefix used by the other calls
                    # here -- confirm the bare name is in scope.  The value
                    # bound to f is not used in the visible lines.
                    f = AlignmentFormatEmissions(
                        pair.mFrom1,
                        pair.mAlignedSequence1,
                        pair.mFrom2,
                        pair.mAlignedSequence2).copy(map_a2b)
                    # shift the map by the start positions of the unaligned
                    # pair (minus one) in rows and columns
                    map_a2b.moveAlignment(-unaligned_pair.mFrom1 +
                                          1, -unaligned_pair.mFrom2 + 1)

                    # render the shifted map against the sequences of the
                    # unaligned pair
                    data = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                                    alignlib_lite.py_makeSequence(
                                                                        unaligned_pair.mAlignedSequence1),
                                                                    alignlib_lite.py_makeSequence(unaligned_pair.mAlignedSequence2))

                    from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
                    from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo
Пример #19
0
def main(argv=None):
    """Script main: report positively selected sites from CodeML output.

    Every positional argument is read as a CodeML output file and parsed
    with ``WrapperCodeML.CodeMLSites.parseOutput``; files for which the
    parser raises ``WrapperCodeML.UsageError`` are counted as skipped.
    The parsed results are then reported once for each entry of
    ``--methods``.  Handled methods are ``summary-numbers``, ``jalview``,
    ``count-positive-sites`` and ``positive-site-table``
    (``positive-site-list`` is accepted as a choice but has no handler
    here).

    If ``--filename-map-mali`` is given, the (translated) input multiple
    alignment is aligned against the (translated) map alignment so that
    ``positive-site-table`` can also print mapped positions.

    :param argv: command line, defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` in this
        function, so parsing presumably always uses ``sys.argv`` - confirm.
    :raises ValueError: for an unknown ``--analysis`` or ``--models``
        entry, or if ``--filename-map-mali`` is given without
        ``--filename-mali``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode",
                      dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames",
                      type="string",
                      help="input pattern.")

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help=
        "threshold for probability above which to include positive sites [default=%default]."
    )

    parser.add_option(
        "--filter-omega",
        dest="filter_omega",
        type="float",
        help=
        "threshold for omega above which to include positive sites [default=%default]."
    )

    parser.add_option("--models",
                      dest="models",
                      type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis",
                      dest="analysis",
                      type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali",
                      dest="filter_mali",
                      type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option(
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help=
        "filename with multiple alignment used for calculating sites - used for filtering"
    )

    parser.add_option(
        "--filename-map-mali",
        dest="filename_map_mali",
        type="string",
        help="filename with multiple alignment to map sites onto.")

    parser.add_option(
        "--jalview-titles",
        dest="jalview_titles",
        type="string",
        help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol",
                      dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    # Without explicit titles the input filenames double as jalview titles.
    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    # String exceptions are not valid exceptions; raise ValueError so the
    # intended message actually reaches the user.
    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    def filter_f(site):
        return (site.mProbability >= options.filter_probability and
                site.mOmega >= options.filter_omega)

    def extract_f(site):
        return site.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    # filename of each successfully parsed result, parallel to ``results``
    headers = []
    for filename in args:
        ninput += 1
        try:
            with open(filename, "r") as infile:
                lines = infile.readlines()
            results.append(codeml.parseOutput(lines))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % filename)
            nskipped += 1
            continue
        noutput += 1
        headers.append(filename)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7', '2': '1', '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            mali.readFromFile(infile)
    else:
        mali = None

    ###############################################################
    ###############################################################
    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped.")

        # translate the alignments
        def translate(s):
            sequence = s.mString
            s.mString = "".join(
                [Genomics.MapCodon2AA(sequence[x:x + 3])
                 for x in range(0, len(sequence), 3)])

        tmali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            tmali.readFromFile(infile)
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        with open(options.filename_map_mali, "r") as infile:
            tmap_mali.readFromFile(infile)

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            # list() keeps this working whether values() yields a list
            # or a view object.
            s = list(tmap_mali.values())[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then
            # align to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write( \
"""# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold )

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            # one running intersection per analysis, across all results
            consistent_cols = [None] * len(options.analysis)
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                # The header and the "cons" row separate the prefix with
                # a tab; do the same for data rows.
                if options.prefix:
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    for x, analysis in enumerate(options.analysis):

                        if analysis == "neb":
                            candidates = sites.mNEB.mPositiveSites
                        elif analysis == "beb":
                            candidates = sites.mBEB.mPositiveSites

                        s = set([extract_f(site) for site in candidates
                                 if filter_f(site)])

                        options.stdout.write("\t%i" % (len(s)))

                        # sites of a model failing the test do not count
                        # towards the consistent sets
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                for x in range(len(options.analysis)):

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                # NOTE(review): the header declares four per-model columns
                # (p, df, chi, lrt) but only two are written here - confirm
                # whether downstream readers rely on this layout.
                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write("\t%i\t%i\n" %
                                 (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            # index into the titles; only advanced for results that
            # produce an annotation line
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            # Kept separate from ``headers`` (the input filenames), which
            # a later "summary-numbers" pass still needs.
            table_headers = ["site", "P"]
            if map_old2new:
                table_headers.append("mapped")
                table_headers.append("Pm")

            options.stdout.write("\t".join(table_headers) + "\n")

            sites = sorted(sites)
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write("# unmapped residue: %i\n" %
                                                 site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Пример #20
0
    def read(self, line):
        """Fill this entry from one tab-separated prediction line.

        Four layouts are recognised by their number of columns:

        * 23: the core columns only (``mAlignmentString`` is set to ``""``)
        * 24: core columns + ``mAlignmentString``
        * 25: ``mPredictionId`` + core columns + ``mAlignmentString``
        * 26: as 25, followed by ``mNAssembled``

        Score, coverage and percent identity/similarity are converted to
        float; ids, coordinates and counts to int.  ``mPredictionId`` and
        ``mNAssembled`` are converted as well even when the layout does
        not supply them, so they must already be set on the instance
        (presumably by the constructor - confirm).

        If ``self.mExpand`` is true, the alignment objects
        ``mMapPeptide2Translation`` and ``mMapPeptide2Genome`` are rebuilt
        from the parsed strings.

        :param line: one record; a trailing newline is optional.
        :raises ValueError: if the number of columns matches no layout.
        """

        # columns shared by every layout, in file order
        core_fields = (
            "mQueryToken", "mSbjctToken", "mSbjctStrand",
            "mRank", "score",
            "mQueryFrom", "mQueryTo", "mQueryAli",
            "mSbjctFrom", "mSbjctTo", "mSbjctAli",
            "mQueryLength", "mQueryCoverage",
            "mNGaps", "mNFrameShifts", "mNIntrons",
            "mNSplits", "mNStopCodons",
            "mPercentIdentity", "mPercentSimilarity",
            "mTranslation",
            "mSbjctGenomeFrom", "mSbjctGenomeTo",
        )
        layouts = {
            26: ("mPredictionId",) + core_fields +
                ("mAlignmentString", "mNAssembled"),
            25: ("mPredictionId",) + core_fields + ("mAlignmentString",),
            24: core_fields + ("mAlignmentString",),
            23: core_fields,
        }

        # Only strip the newline: slicing off the last character would
        # eat data when the line has no terminator, and a plain rstrip()
        # would eat trailing empty columns.
        record = line.rstrip("\n")
        data = record.split("\t")

        fields = layouts.get(len(data))
        if fields is None:
            raise ValueError("unknown format: %i fields in line %s" %
                             (len(data), record))

        for name, value in zip(fields, data):
            setattr(self, name, value)

        if len(data) == 23:
            self.mAlignmentString = ""

        float_fields = ("score", "mQueryCoverage",
                        "mPercentIdentity", "mPercentSimilarity")
        int_fields = ("mPredictionId",
                      "mQueryFrom", "mQueryTo", "mQueryLength",
                      "mSbjctFrom", "mSbjctTo",
                      "mSbjctGenomeFrom", "mSbjctGenomeTo",
                      "mNGaps", "mNIntrons", "mNSplits", "mNStopCodons",
                      "mNFrameShifts", "mNAssembled")

        # Convert a whole group before assigning, so that a malformed
        # value does not leave the group half converted.
        for converter, names in ((float, float_fields), (int, int_fields)):
            converted = [converter(getattr(self, name)) for name in names]
            for name, value in zip(names, converted):
                setattr(self, name, value)

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

            if self.mQueryAli != "" and self.mSbjctAli != "":

                alignlib_lite.py_AlignmentFormatExplicit(
                    self.mQueryFrom, self.mQueryAli,
                    self.mSbjctFrom, self.mSbjctAli).copy(
                        self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Пример #21
0
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Build the alignment between a peptide and its coding sequence.

    The peptide is turned into a wobble sequence and aligned against the
    cds, hence the returned alignment is in nucleotides.  If the codon
    based alignment contains too many gaps, the pair is realigned
    exhaustively.  Codons that are not completely aligned are removed.

    Raises ValueError if the codon based alignment fails or if the
    resulting alignment is empty.
    """

    # strip blanks from the peptide; blanks and gap characters from the cds
    peptide = re.sub(" ", "", peptide_sequence)
    cds = re.sub("[ .-]", "", cds_sequence)

    wobble = Genomics.Protein2Wobble(peptide.upper())

    if options.loglevel >= 6:
        for template, seq in (
                ("# peptide original (%5i): %s\n", peptide),
                ("# cds original     (%5i): %s\n", cds),
                ("# wobble sequence  (%5i): %s\n", wobble)):
            options.stdlog.write(template % (len(seq), seq))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(wobble)
    seq_cds = alignlib_lite.py_makeSequence(cds.upper())
    seq_peptide = alignlib_lite.py_makeSequence(peptide)

    alignment = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble,
                        seq_cds,
                        seq_peptide,
                        alignment,
                        options=options)
    except ValueError as msg:
        raise ValueError("mapping error for sequence: %s" % (msg))

    # Gaps explained by gaps in the peptide itself or by the length
    # difference of the two sequences are not counted; more than
    # ``max_gaps`` remaining gaps trigger the exhaustive alignment.
    max_gaps = 5
    num_peptide_gaps = peptide.count("-")
    length_difference = abs(len(wobble) - len(cds))
    ngaps = alignment.getNumGaps() - 3 * num_peptide_gaps - length_difference

    if options.loglevel >= 6:
        options.stdlog.write(
            "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n"
            % (ngaps, num_peptide_gaps))
        printPrettyAlignment(seq_wobble, seq_cds, peptide, alignment, options)

    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# too many gaps (%i>%i), realigning exhaustively.\n" %
                (ngaps, max_gaps))
            options.stdlog.flush()

        exhaustive = alignlib_lite.py_makeAlignmentVector()
        AlignExhaustive(seq_wobble, seq_cds, seq_peptide, exhaustive,
                        options)

        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n")
            options.stdlog.flush()
            printPrettyAlignment(seq_wobble, seq_cds, peptide, exhaustive,
                                 options)

        alignment = exhaustive

    # drop every codon that has at least one unaligned position
    for codon_start in range(0, len(peptide) * 3, 3):
        if any(alignment.mapRowToCol(codon_start + offset) < 0
               for offset in range(3)):
            alignment.removeRowRegion(codon_start, codon_start + 3)

    if alignment.getLength() == 0:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: empty alignment\n")
            if options.loglevel >= 6:
                options.stdlog.write("# peptide original: %s\n" % peptide)
                options.stdlog.write("# cds original    : %s\n" % cds)
                options.stdlog.write("# wobble sequence : %s\n" % wobble)

        raise ValueError("empty alignment")

    # sanity checks: the alignment must stay within both sequences
    assert alignment.getRowTo() <= seq_wobble.getLength()
    assert alignment.getColTo() <= seq_cds.getLength()

    return alignment
Пример #22
0
    ngaps = map_p2c.getNumGaps() - \
        (num_peptide_gaps * 3) - abs(len(w) - len(c))

    if options.loglevel >= 6:
        options.stdlog.write(
            "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n"
            % (ngaps, num_peptide_gaps))
        printPrettyAlignment(seq_wobble, seq_cds, p, map_p2c, options)

    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# too many gaps (%i>%i), realigning exhaustively.\n" %
                (ngaps, max_gaps))
            options.stdlog.flush()
        full_map_p2c = alignlib_lite.py_makeAlignmentVector()

        AlignExhaustive(seq_wobble, seq_cds, seq_peptide, full_map_p2c,
                        options)
        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n")
            options.stdlog.flush()
            printPrettyAlignment(seq_wobble, seq_cds, p, full_map_p2c, options)

        map_p2c = full_map_p2c

    # remove incomplete codons
    x = 0
    while x < len(p) * 3:
        if (map_p2c.mapRowToCol(x) < 0 or map_p2c.mapRowToCol(x + 1) < 0
                or map_p2c.mapRowToCol(x + 2) < 0):
Пример #23
0
def main(argv=None):
    """Script main: compare the sequences of two fasta files.

    Exactly two filenames are expected as positional arguments.
    Sequences are paired by identifier (after ``MapIdentifiers`` has been
    applied with ``--pattern1`` / ``--pattern2``) and compared in upper
    case.  Each differing pair is classified as ``first``, ``prefix``,
    ``last``, ``selenocysteine``, ``masked`` or ``other``; what is written
    to stdout is selected with ``--output-section``.  With
    ``--correct-gap-shift`` the script additionally tries to reconcile
    pairs that differ only in the length of runs of ``N``.

    :param argv: command line, defaults to ``sys.argv``.
        NOTE(review): the value is not forwarded to ``E.Start`` in this
        function - confirm that this is intended.
    :raises ValueError: if not exactly two files are given or if one of
        them contains no sequences.
    :raises ImportError: if ``--correct-gap-shift`` is requested but
        ``alignlib_lite`` cannot be imported.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--correct-gap-shift",
                      dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1",
        "--pattern1",
        dest="pattern1",
        type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2",
        "--pattern2",
        dest="pattern2",
        type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" is not a valid escape in a plain string literal
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[0], "r"))
    ])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[1], "r"))
    ])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 2 that have a partner in set 1
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = [(s1[x], s2[x])
                                   for x in range(1, len(s1) - 1)
                                   if s1[x] != s2[x]]

                    # An empty list (only the terminal residues differ)
                    # must not qualify for either class below; such
                    # pairs stay "other".
                    if not differences:
                        pass

                    # check for Selenocysteins
                    elif all(a == "U" or b == "U" for a, b in differences):
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif all(a in "NX" or b in "NX" for a, b in differences):
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` only
                # guards against empty sequences; the loop ends when both
                # sequences are exhausted or on the first break.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip surplus N on either side
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                            k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
Пример #24
0
def main(argv=None):
    """Script main: convert pairwise alignment links into aligned pairs.

    Parses command line options in sys.argv, unless *argv* is given.

    Links are read from stdin via ``BlastAlignments.iterator_links``.
    Links whose query or sbjct token is missing from the sequence file
    are skipped.  For the remaining links the alignment is built from
    the link coordinates, optionally shifted to zero-based coordinates
    (``--one-based-coordinates``), scaled by three (``--expand``) and
    combined with the per-token maps read from ``--map``.  Each
    alignment is handed to ``Write``, either as a whole or, if exon
    boundaries were read, once per pair of matching exon segments.

    Raises
    ------
    ValueError
        If an alignment ends beyond the end of one of its sequences, or
        if ``--codons`` is set and the number of aligned positions is
        not a multiple of three.
    IndexError
        If exon boundaries are given but one of the tokens of a link is
        not among them.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help=
        "expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help=
        "expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    # NOTE(review): *argv* is not forwarded to E.Start here - confirm that
    # E.Start falls back to sys.argv on its own.
    (options, args) = E.Start(parser, add_mysql_options=True)

    # ------------------------------------------------------------------
    # read input data
    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        # flush the stream that was actually written to
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        options.stdlog.flush()

    # maps are keyed by token; comment lines start with '#'
    map_old2new = {}
    if options.filename_map:
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        options.stdlog.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            options.stdlog.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")
            try:
                iterator = FastaIterator.FastaIterator(infile)

                # records come in pairs; stop at the first missing record
                while True:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(record1.sequence + ";" +
                                          record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)
            finally:
                infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            options.stdlog.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # ------------------------------------------------------------------
    # process links
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                options.stdlog.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        # combine with the map registered for the query (row) token
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # combine with the map registered for the sbjct (col) token
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not end beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # a token need not have a map; use .get() so that a missing
                # entry does not mask the ValueError raised below.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % item for item in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Пример #25
0
def Alignment2DNA(alignment, query_from=0, sbjct_from=0):
    """convert a peptide2genome alignment to a nucleotide2nucleotide
    alignment.

    Instead of peptide coordinates, the alignment will be
    in codon coordinates.

    Arguments
    ---------
    alignment : list
        List of tuples ``(state, l_query, l_sbjct)`` describing the
        alignment.
    query_from : int
        Start position of alignment on peptide sequence.
    sbjct_from : int
        Start position of alignment on nucleotide sequence.

    Returns
    -------
    alignment : object
       The alignment as an alignlib.AlignmentVector object.
    """

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # Query length, in nucleotides, that is assigned to each of the
    # single-letter states below regardless of the length in the tuple.
    fixed_query_lengths = {
        "A": 0, "B": 1, "C": 2,
        "a": 0, "b": 2, "c": 1,
    }

    # count in nucleotides for query
    query_pos = query_from * 3
    sbjct_pos = sbjct_from

    for state, l_query, l_sbjct in alignment:

        if state in fixed_query_lengths:
            l_query = fixed_query_lengths[state]
        elif state == "S":
            # query advances by as many nucleotides as the sbjct
            l_query = l_sbjct
        else:
            # count as nucleotides
            l_query *= 3

        if l_query > 0 and l_sbjct > 0:
            # all other alignlib_lite functions in this file carry the
            # ``py_`` prefix; use the prefixed name here as well.
            alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                   query_pos,
                                                   query_pos + l_query,
                                                   sbjct_pos - query_pos)

        query_pos += l_query
        sbjct_pos += l_sbjct

    return map_query2sbjct
Пример #26
0
def Alignment2PeptideAlignment(alignment,
                               query_from=0,
                               sbjct_from=0,
                               genomic_sequence=None):
    """convert a Peptide2DNA alignment to a Peptide2Peptide alignment.

    How to handle frameshifts?

    Arguments
    ---------
    alignment : list
        List of tuples ``(state, l_query, l_sbjct)``.  Only the states
        "M", "S", "G" and "P" advance the peptide coordinates; other
        states only advance the position on the genomic sequence.
    query_from : int
        Start position of the alignment on the query.
    sbjct_from : int
        Start position of the alignment on the genomic sequence.
    genomic_sequence : string
        If given, the aligned codons are collected from it and
        translated with ``MapCodon2AA``.

    Returns
    -------
    alignment : object
        The alignment as built by ``alignlib_lite.py_makeAlignmentVector``.
    sbjct_residues : string
        The translated residues; empty if no *genomic_sequence* is given.
    """

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    query_pos = query_from
    sbjct_pos = 0
    sbjct_genome_pos = sbjct_from
    sbjct_residues = []
    codon = ""

    for state, l_query, l_sbjct in alignment:

        query_increment = 0
        sbjct_increment = 0

        # nucleotides covered by this segment on the genomic sequence
        if genomic_sequence:
            segment = genomic_sequence[sbjct_genome_pos:
                                       sbjct_genome_pos + l_sbjct]
        else:
            segment = ""

        if state == "M":
            query_increment = l_query
            # floor division: residues are counted in whole codons,
            # independent of the interpreter's ``/`` semantics.
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                # a match starts a fresh run of codons
                codon = segment

        elif state == "S":
            if l_query:
                sbjct_increment = 1
                query_increment = 1

            if genomic_sequence:
                # nucleotides are appended to those collected so far
                codon += segment

        elif state == "G":
            query_increment = l_query
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon += segment

        elif state == "P":
            # only increment query, sbjct does not advance.
            query_increment = l_query

        if query_increment and sbjct_increment:
            alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                   query_pos, query_pos +
                                                   query_increment,
                                                   sbjct_pos - query_pos)

        if sbjct_increment and genomic_sequence:
            # translate the collected nucleotides codon by codon
            for x in range(0, len(codon), 3):
                sbjct_residues.append(MapCodon2AA(codon[x:x + 3]))
            codon = ""

        query_pos += query_increment
        sbjct_pos += sbjct_increment

        sbjct_genome_pos += l_sbjct

    return map_query2sbjct, "".join(sbjct_residues)
Пример #27
0
    def read(self, line):
        """Fill this entry from one tab-separated line.

        The last character of *line* (the line terminator) is dropped
        before splitting.  The layout is chosen by the number of fields:

        - 26: prediction id, 23 core fields, alignment string,
          number of assembled entries
        - 25: prediction id, 23 core fields, alignment string
        - 24: 23 core fields, alignment string
        - 23: 23 core fields only; the alignment string is set to ""

        Attributes that are absent from the chosen layout keep the
        value they had before the call.  Afterwards the numeric
        attributes are converted to float or int.  If ``self.mExpand``
        is set, the alignment objects are built as well.

        Raises
        ------
        ValueError
            If the line has an unsupported number of fields or a
            numeric field can not be converted.
        """

        data = line[:-1].split("\t")

        # fields shared by all layouts, in column order
        core_fields = (
            "mQueryToken",
            "mSbjctToken",
            "mSbjctStrand",
            "mRank",
            "score",
            "mQueryFrom",
            "mQueryTo",
            "mQueryAli",
            "mSbjctFrom",
            "mSbjctTo",
            "mSbjctAli",
            "mQueryLength",
            "mQueryCoverage",
            "mNGaps",
            "mNFrameShifts",
            "mNIntrons",
            "mNSplits",
            "mNStopCodons",
            "mPercentIdentity",
            "mPercentSimilarity",
            "mTranslation",
            "mSbjctGenomeFrom",
            "mSbjctGenomeTo",
        )

        layouts = {
            26: ("mPredictionId",) + core_fields +
                ("mAlignmentString", "mNAssembled"),
            25: ("mPredictionId",) + core_fields + ("mAlignmentString",),
            24: core_fields + ("mAlignmentString",),
            23: core_fields,
        }

        nfields = len(data)
        if nfields not in layouts:
            raise ValueError("unknown format: %i fields in line %s" % (
                nfields, line[:-1]))

        for name, value in zip(layouts[nfields], data):
            setattr(self, name, value)

        if nfields == 23:
            self.mAlignmentString = ""

        float_fields = ("score", "mQueryCoverage", "mPercentIdentity",
                        "mPercentSimilarity")

        int_fields = ("mPredictionId", "mQueryFrom", "mQueryTo",
                      "mQueryLength", "mSbjctFrom", "mSbjctTo",
                      "mSbjctGenomeFrom", "mSbjctGenomeTo", "mNGaps",
                      "mNIntrons", "mNSplits", "mNStopCodons",
                      "mNFrameShifts", "mNAssembled")

        # convert a whole group before assigning, so that a failing
        # conversion leaves that group untouched.
        for names, convert in ((float_fields, float), (int_fields, int)):
            converted = [convert(getattr(self, name)) for name in names]
            for name, value in zip(names, converted):
                setattr(self, name, value)

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector(
            )

            if self.mQueryAli != "" and self.mSbjctAli != "":

                # NOTE(review): elsewhere in this file from/ali pairs are
                # converted with py_AlignmentFormatEmissions - confirm that
                # py_AlignmentFormatExplicit is intended here.
                alignlib_lite.py_AlignmentFormatExplicit(
                    self.mQueryFrom, self.mQueryAli, self.mSbjctFrom,
                    self.mSbjctAli).copy(self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Пример #28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments, the filenames of two
    multiple alignments.  Both are read, converted to profiles and
    aligned against each other; the second alignment is then added to
    the first and the combined alignment is written to
    ``options.stdout``.
    """

    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    malis = [Mali.Mali(), Mali.Mali()]

    E.info("read 2 multiple alignments")

    for mali, filename in zip(malis, args):
        mali.readFromFile(IOTools.openFile(filename, "r"),
                          format=options.format)

    first_mali, second_mali = malis
    first_cmali = Mali.convertMali2Alignlib(first_mali)
    second_cmali = Mali.convertMali2Alignlib(second_mali)

    # translate the command line choice into the alignlib constant
    if options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    elif options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults used when building the profiles
    encoder = alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20)
    alignlib_lite.py_setDefaultEncoder(encoder)
    logoddor = alignlib_lite.py_makeLogOddorDirichlet(0.3)
    alignlib_lite.py_setDefaultLogOddor(logoddor)
    regularizor = alignlib_lite.py_makeRegularizorDirichletPrecomputed()
    alignlib_lite.py_setDefaultRegularizor(regularizor)

    first_profile = alignlib_lite.py_makeProfile(first_cmali)
    second_profile = alignlib_lite.py_makeProfile(second_cmali)

    profile_alignment = alignlib_lite.py_makeAlignmentVector()
    alignator.align(profile_alignment, first_profile, second_profile)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(profile_alignment))

    # merge the second alignment into the first along the profile alignment
    first_cmali.add(second_cmali, profile_alignment)

    all_identifiers = first_mali.getIdentifiers() + \
        second_mali.getIdentifiers()
    outmali = Mali.convertAlignlib2Mali(first_cmali,
                                        identifiers=all_identifiers)

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Пример #29
0
    def Add(self,
            const_other,
            combine_contig=False,
            allow_overlap=False,
            contig_size=0,
            combine_queries=False,
            as_intron=False):
        """add one entry to another.

        This procedure allows to add

        - predictions on different contigs if combine_contig = True
        - overlapping predictions on the same query if allow_overlap = True
        - results from different queries if combine_queries = True

        - if as_intron is set to true, the new fragment is added as an intron.

        The work is done on copies of both entries (``getCopy``), which
        are expanded first; *const_other* is not modified.  The result
        is stored in ``self``.

        Raises
        ------
        ValueError
            If the queries overlap and *allow_overlap* is not set, if
            the queries differ and *combine_queries* is not set, if the
            sbjct token or strand differ and *combine_contig* is not
            set, or if the genomic regions overlap and *combine_contig*
            is not set.
        """

        # create working copies of each prediction
        other = const_other.getCopy()
        this = self.getCopy()

        other.Expand()
        this.Expand()

        # state inserted between the two fragments
        code = "I" if as_intron else "P"

        # stays 0 for entries on different queries; it is needed below
        # when joining entries on different contigs.
        query_overlap = 0

        # check for query overlaps
        if this.mQueryToken == other.mQueryToken:

            query_overlap = max(
                0,
                min(this.mQueryTo, other.mQueryTo) -
                max(this.mQueryFrom, other.mQueryFrom) + 1)

            if query_overlap > 0:

                if not allow_overlap:
                    raise ValueError(
                        "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True "
                        % (query_overlap, str(this), str(other)))

                overlap = query_overlap
                # if queries overlap, truncate this before adding the other
                this.mMapPeptide2Translation.removeRowRegion(
                    this.mQueryTo - overlap + 1, this.mQueryTo)
                other.mMapPeptide2Translation.moveAlignment(0, -overlap)
                this.mQueryTo -= overlap
                this.mTranslation = this.mTranslation[:-overlap]

                # remove aligned residues from the back
                for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                    if this.mMapPeptide2Genome[x][1] <= overlap:
                        overlap -= this.mMapPeptide2Genome[x][1]
                        del this.mMapPeptide2Genome[x]
                    else:
                        break
                # shorten the last remaining segment by what is left
                last = this.mMapPeptide2Genome[-1]
                this.mMapPeptide2Genome[-1] = (last[0],
                                               last[1] - overlap,
                                               last[2] - overlap * 3)

        elif not combine_queries:
            raise ValueError(
                "refusing to add different queries - set combine_queries = True."
            )

        if this.mSbjctToken != other.mSbjctToken or \
                this.mSbjctStrand != other.mSbjctStrand:
            if combine_contig:
                this.mSbjctToken += "-" + other.mSbjctToken
                this.mSbjctStrand += other.mSbjctStrand
            else:
                raise ValueError("can not add different sbjct.")

        sbjct_overlap = max(
            0,
            min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -
            max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

        if sbjct_overlap > 0 and not combine_contig:
            raise ValueError(
                "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n"
                % (sbjct_overlap, str(this), str(other)))

        if this.mSbjctToken == other.mSbjctToken:

            # set precedence
            if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
                first = this
                second = other
            else:
                first = other
                second = this

            # get length of gap
            d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

            if this.mQueryToken != other.mQueryToken:
                d_aa = first.mQueryLength - first.mQueryTo
                # create a new virtual query by concatenating
                # the two queries
                this.mQueryToken += "-" + other.mQueryToken

                # sort out the alignment
                second.mMapPeptide2Translation.moveAlignment(
                    first.mQueryLength, 0)

                this.mQueryLength = first.mQueryLength + second.mQueryLength

            else:
                d_aa = second.mQueryFrom - first.mQueryTo - 1

            this.mSbjctGenomeFrom = min(this.mSbjctGenomeFrom,
                                        other.mSbjctGenomeFrom)
            this.mSbjctGenomeTo = max(this.mSbjctGenomeTo,
                                      other.mSbjctGenomeTo)

            this.mMapPeptide2Genome = first.mMapPeptide2Genome + \
                [(code, d_aa, d_na)] + second.mMapPeptide2Genome
            this.mTranslation = first.mTranslation + second.mTranslation

            second.mMapPeptide2Translation.moveAlignment(0, first.mSbjctTo - 1)

        else:
            # join on different contigs
            d_na = contig_size - this.mSbjctGenomeTo + \
                other.mSbjctGenomeFrom + query_overlap * 3
            d_aa = other.mQueryFrom - this.mQueryTo - 1
            this.mMapPeptide2Genome += [(code, d_aa, d_na), ] + \
                other.mMapPeptide2Genome
            this.mTranslation += other.mTranslation
            other.mMapPeptide2Translation.moveAlignment(0, this.mSbjctTo - 1)

            this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

        # Now fill self from this, which holds the combined entry in both
        # branches above.  (``first`` only exists in the same-contig
        # branch and may refer to the uncombined ``other``.)
        self.mQueryToken = this.mQueryToken
        self.mQueryLength = this.mQueryLength

        nthis = this.mMapPeptide2Translation.getLength(
        ) - this.mMapPeptide2Translation.getNumGaps()
        nother = other.mMapPeptide2Translation.getLength(
        ) - other.mMapPeptide2Translation.getNumGaps()

        self.mMapPeptide2Genome = this.mMapPeptide2Genome
        self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
        self.mSbjctGenomeTo = this.mSbjctGenomeTo

        # there might be some reference counting issues, thus
        # do it the explicit way.
        alignlib_lite.py_addAlignment2Alignment(this.mMapPeptide2Translation,
                                                other.mMapPeptide2Translation)
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_addAlignment2Alignment(self.mMapPeptide2Translation,
                                                this.mMapPeptide2Translation)

        self.mTranslation = this.mTranslation

        self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
        self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
        self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
        self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

        self.mQueryCoverage = 100.0 * \
            (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

        # segments are written as space-separated values, joined by spaces
        self.mAlignmentString = " ".join(
            " ".join(map(str, segment))
            for segment in self.mMapPeptide2Genome)

        f = alignlib_lite.py_AlignmentFormatEmissions(
            self.mMapPeptide2Translation)
        self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

        # summary parameters
        self.mRank = max(this.mRank, other.mRank)
        self.score += other.score
        self.mNGaps += other.mNGaps
        self.mNFrameShifts += other.mNFrameShifts
        self.mNIntrons += other.mNIntrons + 1
        self.mNStopCodons += other.mNStopCodons

        nnew = self.mMapPeptide2Translation.getLength(
        ) - self.mMapPeptide2Translation.getNumGaps()

        # length-weighted average of both entries, capped at 100%
        self.mPercentIdentity = min(
            100.0,
            (self.mPercentIdentity * nthis + other.mPercentIdentity * nother) /
            nnew)
        self.mPercentSimilarity = min(
            100.0, (self.mPercentSimilarity * nthis +
                    other.mPercentSimilarity * nother) / nnew)

        self.mNAssembled += 1 + other.mNAssembled
Пример #30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads pairwise links from stdin, rebuilds the alignment of each link
    (optionally scaled by 3 with ``--expand`` and/or combined with the
    per-sequence maps read from ``--map``) and passes it to ``Write``:
    once per pair, or once per matching exon segment if exon boundaries
    were supplied with ``--exons``.

    Raises:
        ValueError: if an alignment extends beyond one of its sequences,
            or, with ``--codons``, if the number of aligned positions is
            not a multiple of three.
        IndexError: if exon boundaries were supplied but one of the two
            sequences of a link has no entry.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences", type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option("-e", "--expand", dest="expand", action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons", action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option("--one-based-coordinates", dest="one_based_coordinates", action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical", action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option("-g", "--no-gaps", dest="no_gaps", action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile", type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option("--filter", dest="filename_filter", type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences=None,
        filename_exons=None,
        filename_map=None,
        filename_outfile=None,
        no_gaps=False,
        format="fasta",
        expand=False,
        require_codons=False,
        no_identical=False,
        min_length=0,
        report_step=100,
        one_based_coordinates=False,
        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read input: sequences, exon boundaries, old-to-new maps, filter
    # ------------------------------------------------------------------
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            with open(options.filename_filter, "r") as infile:

                iterator = FastaIterator.FastaIterator(infile)

                # records come in consecutive pairs; an incomplete trailing
                # pair is ignored.
                while 1:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(
                        record1.sequence + ";" + record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" % len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write(
            "# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # alignment objects are allocated once and cleared for every link
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    # ------------------------------------------------------------------
    # main loop over links
    # ------------------------------------------------------------------
    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write(
                    "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1))
                sys.stdout.flush()

        # both partners of a link need a sequence
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        # maps for the two partners; None if no map was read for a token
        row_map = map_old2new.get(link.mQueryToken)
        col_map = map_old2new.get(link.mSbjctToken)

        if row_map is not None:
            tmp1_map_row2col.clear()
            row_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# combining in row with %s\n" %
                    str(alignlib_lite.py_AlignmentFormatEmissions(row_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              row_map.mMapOld2New,
                                              map_row2col,
                                              alignlib_lite.py_RR)
            row_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if col_map is not None:
            tmp1_map_row2col.clear()
            col_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# combining in col with %s\n" %
                    str(alignlib_lite.py_AlignmentFormatEmissions(col_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              map_row2col,
                                              col_map.mMapOld2New,
                                              alignlib_lite.py_CR)
            col_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                (link.mQueryToken,
                 link.mSbjctToken,
                 row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                             row_seq,
                                                             col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            if naligned % 3 != 0:
                # dump diagnostics before failing. The maps are optional,
                # so use the .get() results instead of indexing, which
                # would hide the real error behind a KeyError.
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" % str(row_map))
                options.stdlog.write("# %s\n" % str(col_map))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                                         row_seq,
                                                                         col_seq))

                raise ValueError("incomplete codons %i in pair %s - %s" %
                                 (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write(
            "# %s\n" % ", ".join(["%s=%i" % (k, v) for k, v in counts.items()]))
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
Пример #31
0
def AlignCodonBased(seq_wobble,
                    seq_cds,
                    seq_peptide,
                    map_p2c,
                    options,
                    diag_width=2,
                    max_advance=2):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Walks along both sequences in parallel and adds one aligned pair per
    position to *map_p2c* (which is cleared first). Whenever the
    substitution matrix scores a pair as a mismatch, a window around the
    current position is re-aligned with ``AlignExhaustive`` and the result
    is spliced back into *map_p2c*.

    Due to alignlib this is all in one-based coordinates.
    NOTE(review): the indices used below start at 0 - confirm which
    convention actually applies.
    Takes care of frameshifts.

    Arguments:
        seq_wobble: alignlib sequence that is compared against the cds.
        seq_cds: alignlib sequence of the cds.
        seq_peptide: alignlib sequence of the peptide; only used for
            log output.
        map_p2c: alignment object that receives the result.
        options: needs attributes ``loglevel`` and ``stdlog``.
        diag_width, max_advance: not referenced in this function.

    Raises:
        ValueError: if the re-alignment restarts at the same position
            twice, if residues still mismatch after re-alignment or if
            the re-alignment does not cover the designated window.
    """

    map_p2c.clear()

    # NOTE(review): gop/gep are not referenced again in this function.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # x: current position in wobble sequence, y: current position in cds
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window; used to
    # detect that re-alignment makes no progress.
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            # at the first position of a codon, log the codon, its
            # translation and the expected peptide residue
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y +
                                                       1) + seq_cds.asChar(y +
                                                                           2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0,
                          map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            # the same window start twice in a row means the previous
            # re-alignment did not resolve the mismatch
            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            # window of at most 2 * d residues, clipped to sequence ends
            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c,
                            options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(
                         alignlib_lite.py_AlignmentFormatExplicit(
                             tmp_map_p2c, wobble_fragment, cds_fragment))))

                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)
            # ngap counts consecutive fragment rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            # copy the fragment alignment back, translating fragment
            # coordinates into full-sequence coordinates
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                               xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): last_x/last_y are still None if three gaps
                # occur before the first aligned pair of the fragment -
                # confirm that this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (last_x, seq_wobble.asChar(last_x), last_y,
                               seq_cds.asChar(last_y), xr,
                               seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # this value is overwritten by the assignment right below
            s = 0

        # score the current pair; after a re-alignment x, y and xr are the
        # values left behind by the copy loop above
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
Пример #32
0
def main():
    """Report alignment regions that overlap low-quality genomic bases.

    Reads a gene-to-genome map from ``options.filename_map`` and quality
    scores from the indexed file ``options.quality_file``. Every line on
    stdin holds ``cluster_id``, ``gene_id`` and an aligned sequence. For
    each aligned character whose genomic base has a quality score below
    ``options.quality_threshold`` the alignment position (extended to the
    enclosing frame if ``options.frame`` > 1) is collected; collected
    positions are grouped into regions and written to stdout as
    ``cluster_id``, ``start``, ``end``.

    Raises:
        ValueError: if no map file has been configured.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    # fail with a clear message instead of the TypeError from open(None)
    if options.filename_map is None:
        raise ValueError("no map file given (filename_map is not set)")

    map_genes2genome = {}
    # the context manager closes the file even if the duplicate check fails
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        # skip header line
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        # chain alignment -> gene -> genome into alignment -> genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome, alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp: position counted from the end of the (reverted) alignment,
        # i.e. the position in the original orientation on the negative
        # strand
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            genome_pos = map_mali2genome.mapRowToCol(fp)
            # offset into quality_scores, which starts at mSbjctFrom
            y = genome_pos - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug("low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                        (cluster_id, p, c, match.mSbjctId, match.strand, genome_pos, quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame containing the position
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
Пример #33
0
    def Align( self, method, anchor = 0, loglevel = 1 ):
        """align a pair of sequences.
        get rid of this and use a method class instead in the future

        Aligns self.mSequence1 to self.mSequence2 and stores the alignment
        together with derived values (aligned strings, coordinates, gap
        counts) as attributes of this object.

        Arguments:
            method: one of "dialign", "blastz", "dialignlgs", "dba", "nw"
                or "sw". Any other value is treated as a callable that is
                invoked as ``method(s1, s2, map_a2b)``.
            anchor: number of "A" characters added to both ends of both
                sequences before aligning; the added regions are removed
                from the alignment afterwards.
            loglevel: not referenced in this method.

        Raises:
            NotImplementedError: for method "clustal".
            AlignmentError: if the resulting alignment is empty.
        """

        map_a2b = alignlib_lite.py_makeAlignmentVector()
        flank = "A" * anchor
        s1 = flank + self.mSequence1 + flank
        s2 = flank + self.mSequence2 + flank

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign( self.mOptionsDialign )
            dialign.Align( s1, s2, map_a2b )
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
            blastz.Align( s1, s2, map_a2b )
            # a hit on the reverse strand replaces the second sequence
            if blastz.isReverseComplement():
                self.strand = "-"
                self.mSequence2 = Genomics.complement( self.mSequence2 )

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
            dialignlgs.Align( s1, s2, map_a2b )
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align( s1, s2, map_a2b )
        elif method == "clustal":
            raise NotImplementedError( "clustal wrapper needs to be updated")
        elif method == "nw":
            seq1 = alignlib_lite.py_makeSequence( s1 )
            seq2 = alignlib_lite.py_makeSequence( s2 )
            alignator = alignlib_lite.py_makeAlignatorDPFull( alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                      gop=-12.0,
                                                      gep=-2.0 )
            alignator.align( map_a2b, seq1, seq2 )
        elif method == "sw":
            seq1 = alignlib_lite.py_makeSequence( s1 )
            seq2 = alignlib_lite.py_makeSequence( s2 )
            # NOTE(review): alignator_sw and min_score_sw are not defined
            # in this method - presumably module level names; confirm.
            alignlib_lite.py_performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
        else:
            ## use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            # cut away the parts aligned to the anchors and shift the
            # alignment back by the length of the leading anchor
            map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
            map_a2b.removeRowRegion( 1, anchor)
            map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
            map_a2b.removeColRegion( 1, anchor)
            map_a2b.moveAlignment( -anchor, -anchor )

        explicit = alignlib_lite.py_AlignmentFormatExplicit( map_a2b,
                                              alignlib_lite.py_makeSequence( self.mSequence1),
                                              alignlib_lite.py_makeSequence( self.mSequence2) )

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = explicit.mRowAlignment, explicit.mColAlignment
        emissions = alignlib_lite.py_AlignmentFormatEmissions( map_a2b )
        self.mAlignment1, self.mAlignment2 = emissions.mRowAlignment, emissions.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Пример #34
0
    parser.add_option("-o",
                      "--options",
                      dest="options",
                      type="string",
                      help="BlastZ options.")

    parser.set_defaults(input_filename_seq1=None,
                        input_filename_seq2=None,
                        options="B=0 C=2")

    (options, args) = E.Start(parser)

    wrapper = BlastZ(options.options)

    import alignlib_lite
    seqs1 = Genomics.ReadPeptideSequences(
        open(options.input_filename_seq1, "r"))
    seqs2 = Genomics.ReadPeptideSequences(
        open(options.input_filename_seq2, "r"))
    seq1 = seqs1[seqs1.keys()[0]]
    seq2 = seqs2[seqs2.keys()[0]]
    result = alignlib_lite.py_makeAlignmentVector()
    wrapper.Align(seq1, seq2, result)

    print str(
        alignlib_lite.py_AlignmentFormatExplicit(
            result, alignlib_lite.py_makeSequence(seq1),
            alignlib_lite.py_makeSequence(seq2)))

    E.Stop()
Пример #35
0
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                     diag_width = 2, max_advance = 2 ):
    """Walk along seq_wobble and pair each position with a nucleotide in seq_cds.

    The alignment is built in place in ``map_p2c`` (which is cleared first);
    nothing is returned.  Positions are paired one-to-one as long as the
    back-translation matrix returns a positive score.  On a non-positive
    score a window around the current position is re-aligned with
    ``AlignExhaustive`` and the corresponding row region of ``map_p2c`` is
    replaced by the result.  This is how frameshifts are taken care of.

    NOTE(review): the original docstring states that, due to alignlib, all
    coordinates are one-based, yet ``x`` and ``y`` start at 0 below --
    confirm which convention ``map_p2c`` expects.

    Arguments:
        seq_wobble: sequence object of the wobble sequence (alignment rows).
        seq_cds: sequence object of the cds (alignment columns).
        seq_peptide: sequence object of the peptide; only used for log
            output.
        map_p2c: alignment object that receives the result.
        options: object providing ``loglevel`` and ``stdlog``.
        diag_width, max_advance: not used by this function.

    Raises:
        ValueError: if a re-alignment starts from the same position twice,
            if a pair scores below zero, or if the re-alignment window does
            not reach the problematic residue while more than 4 wobble
            positions remain.
    """
    
    map_p2c.clear()

    # NOTE(review): gop/gep are assigned but not used in this function.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib_lite.py_getDefaultEncoder() )

    # plain-string copies: used for slicing re-alignment windows and logging
    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # y: current position in cds, x: current position in wobble
    y = 0
    x = 0

    # (x_start, y_start) of the previous re-alignment; used to detect a
    # re-alignment that does not make progress
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X": 
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N": 
            x += 1
            continue

        # skip over gaps in wobble 
        if seq_wobble.asChar(x) == "-": 
            x += 1
            continue

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            if (x % 3 == 0):
                # NOTE(review): y+1 and y+2 are not checked against lcds here.
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                                 Genomics.MapCodon2AA( c ),
                                                                                 pep_seq[int(x/3)]) )
                                      
            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                      (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))
            
        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15
            
            # extend by amount dx
            dx = (x % 3) + d
            
            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib_lite.py_RIGHT ))

            # the same window start twice in a row means no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            # the window covers 2 * d positions, clipped to the sequence ends
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib_lite.py_makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(cds_seq[y_start:y_end])
            
            # the result in tmp_map_p2c is relative to the fragments
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                 options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                           y_start, y_end,
                                                                                           str(alignlib_lite.py_AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                 wobble_fragment, 
                                                                                                                                 cds_fragment ))))
                 
                 options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # ngap counts consecutive wobble positions without a partner
            ngap = 0
            last_x, last_y = None, None
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # shift fragment coordinates back to sequence coordinates
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    
                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None if three gaps occur before
                # any pair was added in this window.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()                    
                    ngap = 0
                    
            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
                    
            # NOTE(review): this value is overwritten by the next statement.
            s = 0
            
        # NOTE(review): after a re-alignment x, y and xr refer to the last
        # pair set in the loop above (if any), so that pair is scored and
        # added a second time here -- confirm this is intended.
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")
        
        map_p2c.addPair( x, y, float(s) )
        
        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
Пример #36
0
    # NOTE(review): both input filenames default to None; the options that
    # set them are not visible in this fragment -- open(None) below fails
    # unless they are supplied.
    parser.set_defaults( \
        input_filename_seq1 = None,
        input_filename_seq2 = None,
        options = "B=0 C=2")
    
    (options, args) = E.Start( parser ) 
    
    wrapper = BlastZ( options.options )

    import alignlib_lite
    # NOTE(review): the two files opened here are never closed explicitly.
    seqs1 = Genomics.ReadPeptideSequences( open(options.input_filename_seq1, "r") )
    seqs2 = Genomics.ReadPeptideSequences( open(options.input_filename_seq2, "r") )
    # Only one sequence per file is used.  Indexing keys() works on
    # Python 2 only; presumably the result is a dict, in which case the
    # entry picked is arbitrary -- verify against ReadPeptideSequences.
    seq1 = seqs1[seqs1.keys()[0]]
    seq2 = seqs2[seqs2.keys()[0]]    
    # Align() receives the alignment object and is expected to fill it.
    result = alignlib_lite.py_makeAlignmentVector()
    wrapper.Align( seq1, seq2, result) 

    # Print the alignment together with both sequences (Python 2 print
    # statement).
    print str( alignlib_lite.py_AlignmentFormatExplicit( result,
                                                 alignlib_lite.py_makeSequence( seq1 ),
                                                 alignlib_lite.py_makeSequence( seq2 ) ) )
    
    E.Stop()
        
            
        
                                 
        
        
        
        
Пример #37
0
def Alignment2PeptideAlignment(alignment,
                               query_from=0,
                               sbjct_from=0,
                               genomic_sequence=None):
    """Convert a Peptide2DNA alignment into a Peptide2Peptide alignment.

    Arguments:
        alignment: iterable of ``(state, l_query, l_sbjct)`` triples.  The
            states handled are:

            * ``"M"``: query advances by ``l_query``, subject by
              ``l_sbjct // 3``; the codon buffer is *replaced* by the
              genomic slice.
            * ``"S"``: query and subject advance by one residue if
              ``l_query`` is non-zero; the genomic slice is *appended* to
              the codon buffer.
            * ``"G"``: like ``"M"``, but the genomic slice is appended to
              the codon buffer.
            * ``"P"``: only the query advances.

            Any other state advances nothing but the genomic position.
        query_from: query position of the first alignment entry.
        sbjct_from: position in ``genomic_sequence`` of the first entry.
        genomic_sequence: optional nucleotide sequence.  If given, the
            subject codons are translated with ``MapCodon2AA``.

    Returns:
        A tuple ``(map_query2sbjct, sbjct_peptide)``: the alignment object
        and the translated subject residues joined into one string (empty
        if no ``genomic_sequence`` was given).

    The subject peptide coordinate always starts at 0.  The original
    author left the handling of frameshifts as an open question.
    """

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    query_pos = query_from
    sbjct_pos = 0
    sbjct_genome_pos = sbjct_from
    sbjct_residues = []
    codon = ""

    for state, l_query, l_sbjct in alignment:

        query_increment = 0
        sbjct_increment = 0

        if state == "M":

            query_increment = l_query
            # floor division: peptide coordinates must stay integers on
            # Python 3 as well, where "/" would produce a float.
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon = genomic_sequence[sbjct_genome_pos:sbjct_genome_pos +
                                         l_sbjct]

        elif state == "S":
            if l_query:
                sbjct_increment = 1
                query_increment = 1

            if genomic_sequence:
                codon += genomic_sequence[sbjct_genome_pos:sbjct_genome_pos +
                                          l_sbjct]

        elif state == "G":
            query_increment = l_query
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon += genomic_sequence[sbjct_genome_pos:sbjct_genome_pos +
                                          l_sbjct]

        elif state == "P":
            # only increment query, sbjct does not advance.
            query_increment = l_query

        # an aligned stretch exists only if both sides advance
        if query_increment and sbjct_increment:
            alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct, query_pos,
                                                   query_pos + query_increment,
                                                   sbjct_pos - query_pos)

        # translate whatever has accumulated in the codon buffer once the
        # subject advances, then start a new buffer
        if sbjct_increment and genomic_sequence:
            for x in range(0, len(codon), 3):
                sbjct_residues.append(MapCodon2AA(codon[x:x + 3]))
            codon = ""

        query_pos += query_increment
        sbjct_pos += sbjct_increment

        # the genomic position advances for every entry, whatever the state
        sbjct_genome_pos += l_sbjct

    return map_query2sbjct, "".join(sbjct_residues)
Пример #38
0
        print globals()["__doc__"], msg
        sys.exit(2)

    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version", ):
            print "version="
            sys.exit(0)
        elif o in ("-h", "--help"):
            print globals()["__doc__"]
            sys.exit(0)

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep)
    map_query2token = alignlib_lite.py_makeAlignmentVector()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(
            line[:-1], "\t")

        map_query2token.clear()
        row = alignlib_lite.py_makeSequence(query_sequence)
        col = alignlib_lite.py_makeSequence(sbjct_sequence)
        alignator.align(map_query2token, row, col)

        pidentity = 100.0 * \
            alignlib_lite.py_calculatePercentIdentity(
Пример #39
0
def main(argv=None):
    """Compute regions of multiple alignments to mask due to low quality.

    Reads a map of genes to the genome (``options.filename_map``, parsed
    with ``Blat.iterator``) and quality scores from an indexed fasta file
    (``options.quality_file``).  Each line on ``options.stdin`` holds
    ``cluster_id``, ``gene_id`` and the aligned sequence, separated by
    tabs.  For every aligned position whose quality score is below
    ``options.quality_threshold`` the position (or, if ``options.frame`` is
    larger than 1, the whole frame-sized window containing it) is marked.
    Marked positions are merged with ``Iterators.group_by_distance`` and
    written to ``options.stdout`` as ``cluster_id``, ``start``, ``end``.

    Arguments:
        argv: accepted for interface compatibility; it is not passed on to
            ``E.Start``.

    Raises:
        AssertionError: if a query id occurs twice in the map file.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion",
        dest="random_proportion",
        type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    map_genes2genome = {}
    # "with" makes sure the file is closed even if parsing fails or a
    # duplicate entry is found.
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        # skip the header line
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom, match.mSbjctTo)

        # combine both maps into a map from alignment columns to the genome
        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp is the position counted from the end of the (reverted)
        # alignment; it is the one reported for the negative strand
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            genome_pos = map_mali2genome.mapRowToCol(fp)
            # index into quality_scores, which start at mSbjctFrom
            y = genome_pos - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       genome_pos, quality_scores[y]))
                if options.frame > 1:
                    # mask the complete frame-sized window containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
Пример #40
0
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Align a peptide against its coding sequence.

    The peptide is converted into a wobble sequence, which is aligned to
    the cds; the returned alignment is therefore in nucleotides.

    A codon-based alignment is tried first.  If it contains more than five
    gaps that are explained neither by gaps in the peptide nor by the
    length difference between wobble and cds sequence, it is replaced by
    an exhaustive alignment.  Finally all codons that are not aligned in
    all three positions are removed.

    Raises ValueError if the codon-based alignment fails or if the final
    alignment is empty.
    """

    # peptide without blanks; cds without blanks, dots and dashes
    peptide = peptide_sequence.replace(" ", "")
    cds = re.sub("[ .-]", "", cds_sequence)
    wobble = Genomics.Protein2Wobble(peptide.upper())

    if options.loglevel >= 6:
        for template, sequence in (
                ("# peptide original (%5i): %s\n", peptide),
                ("# cds original     (%5i): %s\n", cds),
                ("# wobble sequence  (%5i): %s\n", wobble)):
            options.stdlog.write(template % (len(sequence), sequence))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(wobble)
    seq_cds = alignlib_lite.py_makeSequence(cds.upper())
    seq_peptide = alignlib_lite.py_makeSequence(peptide)

    map_p2c = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble,
                        seq_cds,
                        seq_peptide,
                        map_p2c,
                        options=options)
    except ValueError as error:
        raise ValueError("mapping error for sequence: %s" % (error))

    # gaps caused by peptide gaps (three nucleotides each) and by the
    # length difference of the two sequences do not count as frameshifts
    max_gaps = 5
    num_peptide_gaps = peptide.count("-")
    expected_gaps = 3 * num_peptide_gaps + abs(len(wobble) - len(cds))
    ngaps = map_p2c.getNumGaps() - expected_gaps

    if options.loglevel >= 6:
        options.stdlog.write(
            "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n"
            % (ngaps, num_peptide_gaps))
        printPrettyAlignment(seq_wobble, seq_cds, peptide, map_p2c, options)

    # more than max_gaps frameshifts: fall back to exhaustive alignment
    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# too many gaps (%i>%i), realigning exhaustively.\n"
                % (ngaps, max_gaps))
            options.stdlog.flush()

        map_p2c = alignlib_lite.py_makeAlignmentVector()
        AlignExhaustive(seq_wobble, seq_cds, seq_peptide, map_p2c, options)

        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n")
            options.stdlog.flush()
            printPrettyAlignment(
                seq_wobble, seq_cds, peptide, map_p2c, options)

    # drop every codon with at least one unaligned position
    for codon_start in range(0, 3 * len(peptide), 3):
        if any(map_p2c.mapRowToCol(codon_start + offset) < 0
               for offset in range(3)):
            map_p2c.removeRowRegion(codon_start, codon_start + 3)

    if map_p2c.getLength() == 0:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: empty alignment\n")
        if options.loglevel >= 6:
            for template, sequence in (
                    ("# peptide original: %s\n", peptide),
                    ("# cds original    : %s\n", cds),
                    ("# wobble sequence : %s\n", wobble)):
                options.stdlog.write(template % sequence)

        raise ValueError("empty alignment")

    assert map_p2c.getRowTo() <= seq_wobble.getLength()
    assert map_p2c.getColTo() <= seq_cds.getLength()

    return map_p2c
Пример #41
0
            continue

        if pair.mMethod == "unaligned":
            unaligned_pair = pair
            pair.mType1 = GetIntronType(unaligned_pair.mAlignedSequence1)
            pair.mType2 = GetIntronType(unaligned_pair.mAlignedSequence2)
            do_print = param_echo_unaligned
        else:
            do_print = 1
            if param_is_compressed:
                if unaligned_pair and \
                        unaligned_pair.mToken1 == pair.mToken1 and \
                        unaligned_pair.mToken2 == pair.mToken2 and \
                        unaligned_pair.mIntronId1 == pair.mIntronId1:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()
                    f = AlignmentFormatEmissions(
                        pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2,
                        pair.mAlignedSequence2).copy(map_a2b)
                    map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1,
                                          -unaligned_pair.mFrom2 + 1)

                    data = alignlib_lite.py_AlignmentFormatExplicit(
                        map_a2b,
                        alignlib_lite.py_makeSequence(
                            unaligned_pair.mAlignedSequence1),
                        alignlib_lite.py_makeSequence(
                            unaligned_pair.mAlignedSequence2))

                    from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
                    from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo
Пример #42
0
        raise ValueError( "mapping error for sequence: %s" % (msg) )

    ## if there are more than five frameshifts - do exhaustive alignment
    max_gaps = 5
    num_peptide_gaps = len( re.sub("[^-]", "", p ) )
    ngaps = map_p2c.getNumGaps() - (num_peptide_gaps *  3) - abs(len(w)-len(c))
    
    if options.loglevel >= 6:
        options.stdlog.write("# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" % (ngaps, num_peptide_gaps) )
        PrintPrettyAlignment( seq_wobble, seq_cds, p, map_p2c, options )

    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write("# too many gaps (%i>%i), realigning exhaustively.\n" % (ngaps, max_gaps ) )
            options.stdlog.flush()
        full_map_p2c = alignlib_lite.py_makeAlignmentVector()
        
        AlignExhaustive( seq_wobble, seq_cds, seq_peptide, full_map_p2c, options )
        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n" )
            options.stdlog.flush()
            PrintPrettyAlignment( seq_wobble, seq_cds, p, full_map_p2c, options )

        map_p2c = full_map_p2c
        
    ## remove incomplete codons
    x = 0
    while x < len(p) * 3:
        if (map_p2c.mapRowToCol( x ) < 0 or \
            map_p2c.mapRowToCol( x+1 ) < 0 or \
            map_p2c.mapRowToCol( x+2 ) < 0 ):
Пример #43
0
def main(argv=None):
    """Script entry point: align two multiple alignments to each other.

    Command line options are taken from *argv*, or from sys.argv if *argv*
    is empty or not given.  The two positional arguments are the files
    with the multiple alignments.  Both are converted into profiles, the
    profiles are aligned, and the merged multiple alignment is written to
    ``options.stdout``.

    Raises ValueError unless exactly two filenames are supplied.
    """

    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")

    parser.add_option(
        "-e", "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")

    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    first_mali, second_mali = Mali.Mali(), Mali.Mali()

    E.info("read 2 multiple alignments")

    for mali, filename in zip((first_mali, second_mali), args):
        mali.readFromFile(IOTools.openFile(filename, "r"),
                          format=options.format)

    first_cmali = Mali.convertMali2Alignlib(first_mali)
    second_cmali = Mali.convertMali2Alignlib(second_mali)

    if options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    elif options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults that are set before the profiles are built
    encoder = alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20)
    alignlib_lite.py_setDefaultEncoder(encoder)
    logoddor = alignlib_lite.py_makeLogOddorDirichlet(0.3)
    alignlib_lite.py_setDefaultLogOddor(logoddor)
    regularizor = alignlib_lite.py_makeRegularizorDirichletPrecomputed()
    alignlib_lite.py_setDefaultRegularizor(regularizor)

    first_profile = alignlib_lite.py_makeProfile(first_cmali)
    second_profile = alignlib_lite.py_makeProfile(second_cmali)

    profile_alignment = alignlib_lite.py_makeAlignmentVector()
    alignator.align(profile_alignment, first_profile, second_profile)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(profile_alignment))

    # merge the second alignment into the first one
    first_cmali.add(second_cmali, profile_alignment)

    all_identifiers = first_mali.getIdentifiers() + second_mali.getIdentifiers()
    merged = Mali.convertAlignlib2Mali(first_cmali,
                                       identifiers=all_identifiers)

    merged.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Пример #44
0
def main(argv=None):
    """Compare the sequences of two FASTA files that share identifiers.

    The two filenames are taken from the arguments the parser does not
    recognise.  Identifiers of both files are rewritten with
    ``MapIdentifiers`` using ``--pattern1`` / ``--pattern2``.  Sequences
    are compared case-insensitively and every difference is classified as
    ``first``, ``prefix``, ``last``, ``selenocysteine``, ``masked`` or
    ``other``.  Depending on ``--output-section`` the differing and the
    missing identifiers are written to ``args.stdout``; summary counts are
    logged at the end.

    With ``--correct-gap-shift`` an attempt is made to bring differing
    sequences into register by skipping runs of ``N`` that are present in
    only one of them.

    Arguments:
        argv: accepted for interface compatibility; it is not passed on to
            ``E.start``.

    Raises:
        ValueError: unless exactly two files are given, or if one of the
            files contains no sequences.
        ImportError: if ``--correct-gap-shift`` is requested and
            ``alignlib_lite`` cannot be imported.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py ")

    parser.add_argument(
        "-1", "--pattern1", dest="pattern1", type=str,
        help="pattern to extract identifier from in identifiers1. "
        )

    parser.add_argument(
        "-2", "--pattern2", dest="pattern2", type=str,
        help="pattern to extract identifier from in identifiers2. "
        )

    parser.add_argument(
        "-o", "--output-section", dest="output", type=str,
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output ")

    # raw strings: "\S" inside a plain literal is an invalid escape sequence
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two files needed to compare.")

    if args.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = {
        x.title: x.sequence for x in FastaIterator.iterate(
            iotools.open_file(unknown[0], "r"))}
    seqs2 = {
        x.title: x.sequence for x in FastaIterator.iterate(
            iotools.open_file(unknown[1], "r"))}

    if not seqs1:
        raise ValueError("first file %s is empty." % (unknown[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (unknown[1]))

    MapIdentifiers(seqs1, args.pattern1)
    MapIdentifiers(seqs2, args.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of the second file that were also seen in the first one
    found2 = set()

    write_missed1 = "missed" in args.output
    write_missed2 = "missed" in args.output
    write_seqdiff = "seqdiff" in args.output
    write_diff = "diff" in args.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                # only the first residue differs
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                # one sequence is a prefix of the other
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                # only the last residue differs
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    # check for selenocysteines: every difference involves
                    # a "U" (also true if there are no inner differences)
                    if all(x[0] == "U" or x[1] == "U" for x in differences):
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif all(x[0] in "NX" or x[1] in "NX" for x in differences):
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if args.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never incremented, so for non-empty
                # sequences the loop only ends via the second half of the
                # condition or via one of the "break" statements.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip a run of N that is present in only one
                            # of the two sequences
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                else:
                    # the loop ended without "break": both sequences were
                    # consumed completely
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                args.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                args.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(seqs2):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    args.stdlog.write("# Legend:\n")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
Пример #45
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are filenames with codeml output. Each file
    is parsed with ``WrapperCodeML.CodeMLSites``; files raising
    ``WrapperCodeML.UsageError`` are skipped and counted. For every entry
    in ``--methods`` a report is written to ``options.stdout``:

    summary-numbers
        one row per input file with the number of positive sites per
        model/analysis and the result of the log-likelihood ratio test,
        followed by a ``cons`` row.
    jalview
        a jalview annotation line per input file with positive sites.
    count-positive-sites
        the number of selected positive sites.
    positive-site-table
        a table of selected sites, optionally mapped onto the alignment
        given by ``--filename-map-mali``.

    Raises
    ------
    ValueError
        If an unknown analysis section or model is requested, or if
        ``--filename-map-mali`` is given without ``--filename-mali``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice", action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode", dest="selection_mode", type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames", dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option("--filter-probability", dest="filter_probability", type="float",
                      help="threshold for probability above which to include positive sites [default=%default].")

    parser.add_option("--filter-omega", dest="filter_omega", type="float",
                      help="threshold for omega above which to include positive sites [default=%default].")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold", dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option("--filename-map-mali", dest="filename_map_mali", type="string",
                      help="filename with multiple alignment to map sites onto.")

    parser.add_option("--jalview-titles", dest="jalview_titles", type="string",
                      help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol", dest="jalview_symbol", type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    # jalview titles default to the input filenames
    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    # string exceptions are not valid exceptions: raise a proper
    # exception type so that the message actually reaches the user.
    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    # filenames of the successfully parsed results, parallel to `results`
    headers = []
    for f in args:
        ninput += 1
        try:
            # close the file as soon as its lines have been read
            with open(f, "r") as infile:
                lines = infile.readlines()
            results.append(codeml.parseOutput(lines))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7',
                         '2': '1',
                         '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        mali.readFromFile(open(options.filename_mali, "r"))
    else:
        mali = None

    ###############################################################
    ###############################################################
    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped.")

        # translate the alignments
        def translate(s):
            sequence = s.mString
            seq = []
            for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)

            s.mString = "".join(seq)

        tmali = Mali.Mali()
        tmali.readFromFile(open(options.filename_mali, "r"))
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile(open(options.filename_map_mali, "r"))

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            # list() so that indexing also works if values() is a view
            s = list(tmap_mali.values())[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then align to
            # this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold)

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            # one slot per analysis; each accumulates the intersection of
            # the site sets over all results and models
            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                # the prefix is a column of its own (see header and the
                # "cons" row below), so it needs the tab separator.
                if options.prefix:
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    x = 0
                    for analysis in options.analysis:

                        if analysis == "neb":
                            s = set(
                                map(extract_f, filter(filter_f, sites.mNEB.mPositiveSites)))

                        elif analysis == "beb":
                            s = set(
                                map(extract_f, filter(filter_f, sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        # sites of a model failing the test do not count
                        # towards the consistent sets
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                        x += 1

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" % (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                x = 0

                for analysis in options.analysis:

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                    x += 1

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            # index of the next jalview title; only advanced for results
            # that have at least one site
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                # sites are 1-based
                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" % (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            # use a separate name: `headers` holds the input filenames and
            # is still needed if "summary-numbers" is processed later.
            table_headers = ["site", "P"]
            if map_old2new:
                table_headers.append("mapped")
                table_headers.append("Pm")

            options.stdout.write("\t".join(table_headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    # a result of 0 is treated as "not mapped"
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# sites: ninput=%i, noutput=%i, nskipped=%i\n" % (
                    len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Пример #46
0
def Alignment2DNA(alignment, query_from=0, sbjct_from=0):
    """Build a nucleotide-to-nucleotide alignment from a peptide2genome
    alignment.

    Query (peptide) positions are expressed in codon coordinates, i.e.
    multiplied by three; subject positions are used as given.

    Arguments
    ---------
    alignment : list
        List of ``(state, l_query, l_sbjct)`` tuples of the alignment
        in CIGAR format.
    query_from : int
        Start position of alignment on peptide sequence.
    sbjct_from : int
        Start position of alignment on nucleotide sequence.

    Returns
    -------
    alignment : object
       The object returned by ``alignlib_lite.py_makeAlignmentVector``,
       with one diagonal added per segment that is non-empty on both
       query and subject.
    """

    # Query length in nucleotides that replaces the segment's own length
    # for the states "A"/"B"/"C" and "a"/"b"/"c".
    fixed_query_lengths = {
        "A": 0, "B": 1, "C": 2,
        "a": 0, "b": 2, "c": 1,
    }

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # the query is counted in nucleotides
    query_pos = 3 * query_from
    sbjct_pos = sbjct_from

    for state, l_query, l_sbjct in alignment:

        # default: peptide length converted to nucleotides
        nquery = l_query * 3

        if state in fixed_query_lengths:
            nquery = fixed_query_lengths[state]
        elif state == "S":
            # "S" segments take the subject length on the query as well
            nquery = l_sbjct

        if nquery > 0 and l_sbjct > 0:
            # NOTE(review): every other alignlib_lite name in this file
            # carries a "py_" prefix - confirm that the un-prefixed
            # addDiagonal2Alignment exists.
            alignlib_lite.addDiagonal2Alignment(
                map_query2sbjct,
                query_pos,
                query_pos + nquery,
                sbjct_pos - query_pos)

        query_pos += nquery
        sbjct_pos += l_sbjct

    return map_query2sbjct
Пример #47
0
    def Add(self, const_other,
            combine_contig=False,
            allow_overlap=False,
            contig_size=0,
            combine_queries=False,
            as_intron=False):
        """add one entry to another.

        This procedure allows to add

        - predictions on different contigs if combine_contig = True
        - overlapping predictions on the same query if allow_overlap = True
        - results from different queries if combine_queries = True

        - if as_intron is set to true, the new fragment is added as an intron.

        The merge is computed on working copies obtained through
        ``getCopy()`` of both entries; the result is then written into
        *self*.

        Arguments
        ---------
        const_other : object
            The entry to add to this one.
        combine_contig : bool
            Permit entries on different sbjct tokens/strands and
            overlapping sbjct ranges.
        allow_overlap : bool
            Permit overlap on the query; the end of this entry is
            truncated by the overlap before the other entry is added.
        contig_size : int
            Offset added to the genomic coordinates of the other entry
            when joining entries on different contigs.
        combine_queries : bool
            Permit entries with different query tokens.
        as_intron : bool
            Join the two parts with state "I" instead of "P".

        Raises
        ------
        ValueError
            If the entries overlap on the query, have different queries,
            are on different sbjcts or overlap on the sbjct, and the
            corresponding option has not been set.
        """

        # create working copies of each prediction
        other = const_other.getCopy()
        this = self.getCopy()

        other.Expand()
        this.Expand()

        if as_intron:
            code = "I"
        else:
            code = "P"

        # Defined up-front: the different-contig branch below reads it even
        # when the queries differ and no overlap has been computed.
        query_overlap = 0

        # check for query overlaps
        if this.mQueryToken == other.mQueryToken:

            query_overlap = max(0, min(this.mQueryTo, other.mQueryTo) -
                                max(this.mQueryFrom, other.mQueryFrom) + 1)

            if query_overlap > 0:

                if allow_overlap:
                    overlap = query_overlap
                    # if queries overlap, truncate this before adding the
                    # other
                    this.mMapPeptide2Translation.removeRowRegion(
                        this.mQueryTo - overlap + 1, this.mQueryTo)
                    other.mMapPeptide2Translation.moveAlignment(0, -overlap)
                    this.mQueryTo -= overlap
                    this.mTranslation = this.mTranslation[:-overlap]

                    # remove aligned residues from the back: drop whole
                    # segments while they fit into the remaining overlap
                    for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                        if this.mMapPeptide2Genome[x][1] <= overlap:
                            overlap -= this.mMapPeptide2Genome[x][1]
                            del this.mMapPeptide2Genome[x]
                        else:
                            break
                    # shorten the last remaining segment by what is left
                    this.mMapPeptide2Genome[-1] = (
                        this.mMapPeptide2Genome[-1][0],
                        this.mMapPeptide2Genome[-1][1] - overlap,
                        this.mMapPeptide2Genome[-1][2] - overlap * 3)
                else:
                    raise ValueError(
                        "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True " %
                        (query_overlap, str(this), str(other)))

        else:
            if not combine_queries:
                raise ValueError(
                    "refusing to add different queries - set combine_queries = True.")

        if this.mSbjctToken != other.mSbjctToken or \
                this.mSbjctStrand != other.mSbjctStrand:
            if combine_contig:
                # NOTE(review): the combined token/strand are set on the
                # working copy only and are not copied back to self below.
                this.mSbjctToken += "-" + other.mSbjctToken
                this.mSbjctStrand += other.mSbjctStrand
            else:
                raise ValueError("can not add different sbjct.")

        sbjct_overlap = max(0, min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -
                            max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

        if sbjct_overlap > 0:
            if not combine_contig:
                raise ValueError(
                    "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n" %
                    (sbjct_overlap, str(this), str(other)))

        if this.mSbjctToken == other.mSbjctToken:

            # set precedence
            if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
                first = this
                second = other
            else:
                first = other
                second = this

            # get length of gap
            d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

            if this.mQueryToken != other.mQueryToken:
                d_aa = first.mQueryLength - first.mQueryTo
                # create a new virtual query by concatenating
                # the two queries
                this.mQueryToken += "-" + other.mQueryToken

                # sort out the alignment
                second.mMapPeptide2Translation.moveAlignment(
                    first.mQueryLength, 0)

                this.mQueryLength = first.mQueryLength + second.mQueryLength

            else:
                d_aa = second.mQueryFrom - first.mQueryTo - 1

            this.mSbjctGenomeFrom = min(
                this.mSbjctGenomeFrom, other.mSbjctGenomeFrom)
            this.mSbjctGenomeTo = max(
                this.mSbjctGenomeTo, other.mSbjctGenomeTo)

            this.mMapPeptide2Genome = first.mMapPeptide2Genome + \
                [(code, d_aa, d_na)] + second.mMapPeptide2Genome
            this.mTranslation = first.mTranslation + second.mTranslation

            second.mMapPeptide2Translation.moveAlignment(
                0, first.mSbjctTo - 1)

        else:
            # join on different contigs
            d_na = contig_size - this.mSbjctGenomeTo + \
                other.mSbjctGenomeFrom + query_overlap * 3
            d_aa = other.mQueryFrom - this.mQueryTo - 1
            this.mMapPeptide2Genome += [(code, d_aa, d_na), ] + \
                other.mMapPeptide2Genome
            this.mTranslation += other.mTranslation
            other.mMapPeptide2Translation.moveAlignment(0, this.mSbjctTo - 1)

            this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

        # Now fill self from this. All merged values have been collected in
        # `this`; `first` only exists in the same-contig branch and may be
        # `other`, which carries neither the merged token nor the merged
        # alignment.
        self.mQueryToken = this.mQueryToken
        self.mQueryLength = this.mQueryLength

        nthis = this.mMapPeptide2Translation.getLength() - \
            this.mMapPeptide2Translation.getNumGaps()
        nother = other.mMapPeptide2Translation.getLength() - \
            other.mMapPeptide2Translation.getNumGaps()

        self.mMapPeptide2Genome = this.mMapPeptide2Genome
        self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
        self.mSbjctGenomeTo = this.mSbjctGenomeTo

        # there might be some reference counting issues, thus
        # do it the explicit way.
        alignlib_lite.py_addAlignment2Alignment(
            this.mMapPeptide2Translation, other.mMapPeptide2Translation)
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_addAlignment2Alignment(
            self.mMapPeptide2Translation, this.mMapPeptide2Translation)

        self.mTranslation = this.mTranslation

        self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
        self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
        self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
        self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

        self.mQueryCoverage = 100.0 * \
            (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

        # space-separated flat list of all (state, l_query, l_sbjct) fields
        self.mAlignmentString = " ".join(
            " ".join(map(str, x)) for x in self.mMapPeptide2Genome)

        f = alignlib_lite.py_AlignmentFormatEmissions(
            self.mMapPeptide2Translation)
        self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

        # summary parameters
        self.mRank = max(this.mRank, other.mRank)
        self.score += other.score
        self.mNGaps += other.mNGaps
        self.mNFrameShifts += other.mNFrameShifts
        self.mNIntrons += other.mNIntrons + 1
        self.mNStopCodons += other.mNStopCodons

        nnew = self.mMapPeptide2Translation.getLength() - \
            self.mMapPeptide2Translation.getNumGaps()

        # length-weighted average of both parts, capped at 100
        self.mPercentIdentity = min(
            100.0,
            (self.mPercentIdentity * nthis + other.mPercentIdentity * nother) / nnew)
        self.mPercentSimilarity = min(
            100.0,
            (self.mPercentSimilarity * nthis + other.mPercentSimilarity * nother) / nnew)

        self.mNAssembled += 1 + other.mNAssembled
Пример #48
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects two positional arguments, the filenames of two sets of
    peptide sequences. Sequences present under the same identifier in
    both sets are compared (case-insensitively) and differences are
    classified as first / prefix / last / selenocysteine / masked /
    other. With ``--correct-gap-shift`` an additional attempt is made to
    reconcile sequences that differ only in the length of runs of ``N``.
    Per-sequence output is controlled by ``--output``; summary counts
    are written through ``E.info``.

    Raises
    ------
    ValueError
        If not exactly two files are given or one of them contains no
        sequences.
    ImportError
        If ``--correct-gap-shift`` is set and alignlib_lite can not be
        imported.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: diff_fasta.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift", action="store_true",
                      help="correct gap length shifts in alignments. Requires alignlib_lite.py_ "
                      "[%default]")
    parser.add_option("-1", "--pattern1", dest="pattern1", type="string",
                      help="pattern to extract identifier from in identifiers1. "
                      "[%default]")
    parser.add_option("-2", "--pattern2", dest="pattern2", type="string",
                      help="pattern to extract identifier from in identifiers2. "
                      "[%default]")

    parser.add_option("-o", "--output", dest="output", type="choice", action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" in a plain string literal is an invalid escape
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ but alignlib not found")

    seqs1 = Genomics.ReadPeptideSequences(IOTools.openFile(args[0], "r"))
    seqs2 = Genomics.ReadPeptideSequences(IOTools.openFile(args[1], "r"))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also found in set 2
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences:
                    # the first and last residues can be different for peptide sequences when comparing
                    # my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    # all() instead of len(filter(...)) == len(differences):
                    # same result, but filter() has no len() on Python 3.

                    # check for Selenocysteins
                    if all(d[0] == "U" or d[1] == "U" for d in differences):
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif all(d[0] in "NX" or d[1] in "NX" for d in differences):
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so "x < m" only keeps
                # the loop from starting when one sequence is empty.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip the surplus of a run of N in either
                            # sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    # loop finished without break: both sequences were
                    # consumed completely
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))
    E.info("ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
           (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine, ndiff_masked, nfixed,
            ndiff - ndiff_first - ndiff_last - ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
Пример #49
0
    except getopt.error, msg:
        print globals()["__doc__"], msg
        sys.exit(2)

    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version",):
            print "version="
            sys.exit(0)
        elif o in ("-h", "--help"):
            print globals()["__doc__"]
            sys.exit(0)

    alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep)
    map_query2token = alignlib_lite.py_makeAlignmentVector()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t")

        map_query2token.clear()
        row = alignlib_lite.py_makeSequence(query_sequence)
        col = alignlib_lite.py_makeSequence(sbjct_sequence)
        alignator.align(map_query2token, row, col)

        pidentity = 100.0 * alignlib_lite.py_calculatePercentIdentity(map_query2token, row, col)
        psimilarity = 100.0 * alignlib_lite.py_calculatePercentSimilarity(map_query2token)
        print string.join(