Exemplo n.º 1
0
 def expand( self ):
     """Build ``self.mMapOld2New`` on demand.

     Does nothing if ``self.mMapOld2New`` is already truthy.  Otherwise a
     new alignment vector is stored in ``self.mMapOld2New`` and filled
     from the emission-format description held in ``mOldFrom``/``mOldAli``
     (row) and ``mNewFrom``/``mNewAli`` (column).
     """
     if self.mMapOld2New:
         # already expanded - nothing to do
         return

     target = alignlib_lite.py_makeAlignmentVector()
     self.mMapOld2New = target

     emissions = alignlib_lite.py_AlignmentFormatEmissions(
         self.mOldFrom, self.mOldAli,
         self.mNewFrom, self.mNewAli )
     emissions.copy( target )
Exemplo n.º 2
0
    def expand(self):
        """Create and fill ``self.mMapOld2New`` if it is not set yet.

        When ``self.mMapOld2New`` is already truthy the method returns
        immediately.  Otherwise a fresh alignment vector is assigned to it
        and populated from the emission-format fields ``mOldFrom``,
        ``mOldAli``, ``mNewFrom`` and ``mNewAli``.
        """
        if self.mMapOld2New:
            return

        old2new = alignlib_lite.py_makeAlignmentVector()
        self.mMapOld2New = old2new

        formatted = alignlib_lite.py_AlignmentFormatEmissions(
            self.mOldFrom, self.mOldAli, self.mNewFrom, self.mNewAli)
        formatted.copy(old2new)
Exemplo n.º 3
0
 def GetMap( self ):
     """Return an alignment between the two segments, or None.

     The alignment is built from the emission-format fields
     ``mAlignmentFrom1``/``mAlignment1`` (row) and
     ``mAlignmentFrom2``/``mAlignment2`` (column).  ``None`` is returned
     when either start coordinate is falsy.
     """
     # NOTE(review): a start coordinate of 0 is falsy and therefore also
     # yields None - confirm that 0 is never a valid start here.
     if not ( self.mAlignmentFrom1 and self.mAlignmentFrom2 ):
         return None

     segment_map = alignlib_lite.py_makeAlignmentVector()
     emissions = alignlib_lite.py_AlignmentFormatEmissions(
         self.mAlignmentFrom1, self.mAlignment1,
         self.mAlignmentFrom2, self.mAlignment2 )
     emissions.copy( segment_map )
     return segment_map
Exemplo n.º 4
0
    def fillFromTable( self, table_row ):
        """Populate this entry from one row of a predictions table.

        The first 25 columns are assigned, in order, to the attributes
        ``mPredictionId`` ... ``mAlignmentString``.  If a 26th column is
        present it is stored in ``mNAssembled``; any further columns are
        ignored.  With exactly 25 columns ``mNAssembled`` is left untouched.

        If ``self.mExpand`` is set, ``mMapPeptide2Translation`` and
        ``mMapPeptide2Genome`` are rebuilt from the freshly assigned fields.

        :param table_row: sequence (supporting ``len`` and slicing) with at
            least 25 fields.
        :raises ValueError: if ``table_row`` has fewer than 25 fields.
        """
        nfields = len(table_row)
        if nfields < 25:
            # report the size of the row actually received
            raise ValueError( "unknown format: %i fields" % nfields )

        ( self.mPredictionId,
          self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
          self.mRank, self.score,
          self.mQueryFrom, self.mQueryTo, self.mQueryAli,
          self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
          self.mQueryLength, self.mQueryCoverage,
          self.mNGaps, self.mNFrameShifts, self.mNIntrons,
          self.mNSplits, self.mNStopCodons,
          self.mPercentIdentity, self.mPercentSimilarity,
          self.mTranslation,
          self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
          self.mAlignmentString ) = table_row[:25]

        # rows with 26 or more columns carry mNAssembled in column 26;
        # columns beyond that are ignored.
        if nfields > 25:
            self.mNAssembled = table_row[25]

        if self.mExpand:
            self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

            # only build the peptide/translation map if both alignment
            # strings are present
            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib_lite.py_AlignmentFormatEmissions(
                    self.mQueryFrom, self.mQueryAli,
                    self.mSbjctFrom, self.mSbjctAli ).copy( self.mMapPeptide2Translation )

            self.mMapPeptide2Genome = Genomics.String2Alignment( self.mAlignmentString )
Exemplo n.º 5
0
    def fillFromTable(self, table_row):
        """Populate this entry from one row of a predictions table.

        The first 25 columns are assigned, in order, to the attributes
        ``mPredictionId`` ... ``mAlignmentString``.  If a 26th column is
        present it is stored in ``mNAssembled``; any further columns are
        ignored.  With exactly 25 columns ``mNAssembled`` is left untouched.

        If ``self.mExpand`` is set, ``mMapPeptide2Translation`` and
        ``mMapPeptide2Genome`` are rebuilt from the freshly assigned fields.

        :param table_row: sequence (supporting ``len`` and slicing) with at
            least 25 fields.
        :raises ValueError: if ``table_row`` has fewer than 25 fields.
        """
        nfields = len(table_row)
        if nfields < 25:
            # report the size of the row actually received
            raise ValueError("unknown format: %i fields" % nfields)

        (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
         self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
         self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
         self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits,
         self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString) = table_row[:25]

        # rows with 26 or more columns carry mNAssembled in column 26;
        # columns beyond that are ignored.
        if nfields > 25:
            self.mNAssembled = table_row[25]

        if self.mExpand:
            self.mMapPeptide2Translation = \
                alignlib_lite.py_makeAlignmentVector()

            # only build the peptide/translation map if both alignment
            # strings are present
            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib_lite.py_AlignmentFormatEmissions(
                    self.mQueryFrom, self.mQueryAli, self.mSbjctFrom,
                    self.mSbjctAli).copy(self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Exemplo n.º 6
0
    def Expand( self ):
        """Switch the entry to expanded mode and refresh derived fields.

        Sets ``mExpand``.  If ``mMapPeptide2Translation`` has a length
        greater than zero, the alignment strings and the from/to
        coordinates of query and sbjct are re-derived from it.
        ``mMapPeptide2Genome`` is always rebuilt from ``mAlignmentString``.
        """
        self.mExpand = True

        peptide_map = self.mMapPeptide2Translation
        if peptide_map.getLength() > 0:
            emissions = alignlib_lite.py_AlignmentFormatEmissions( peptide_map )
            self.mQueryAli = emissions.mRowAlignment
            self.mSbjctAli = emissions.mColAlignment
            # rows correspond to the query, columns to the sbjct
            self.mQueryFrom = peptide_map.getRowFrom()
            self.mQueryTo = peptide_map.getRowTo()
            self.mSbjctFrom = peptide_map.getColFrom()
            self.mSbjctTo = peptide_map.getColTo()

        genome_map = Genomics.String2Alignment( self.mAlignmentString )
        self.mMapPeptide2Genome = genome_map
Exemplo n.º 7
0
    def Expand(self):
        """Enable expanded mode and re-derive the alignment fields.

        ``mExpand`` is set to True.  When ``mMapPeptide2Translation`` is
        non-empty (``getLength() > 0``) the query/sbjct alignment strings
        and their from/to coordinates are taken from it.  In every case
        ``mMapPeptide2Genome`` is rebuilt from ``mAlignmentString``.
        """
        self.mExpand = True

        pep2trans = self.mMapPeptide2Translation
        if pep2trans.getLength() > 0:
            formatted = alignlib_lite.py_AlignmentFormatEmissions(pep2trans)
            self.mQueryAli = formatted.mRowAlignment
            self.mSbjctAli = formatted.mColAlignment
            # rows correspond to the query, columns to the sbjct
            self.mQueryFrom = pep2trans.getRowFrom()
            self.mQueryTo = pep2trans.getRowTo()
            self.mSbjctFrom = pep2trans.getColFrom()
            self.mSbjctTo = pep2trans.getColTo()

        pep2genome = Genomics.String2Alignment(self.mAlignmentString)
        self.mMapPeptide2Genome = pep2genome
Exemplo n.º 8
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads links from stdin (via ``BlastAlignments.iterator_links``) and,
    for every link whose query and sbjct are both present in the sequence
    file, builds the pairwise alignment, optionally maps it through the
    old-to-new maps given with ``--map``, and hands it to ``Write`` --
    either once per link or, if exon boundaries were supplied, once per
    pair of matching exons.

    Raises ValueError for alignments that run past the end of a sequence
    or (with ``--codons``) contain incomplete codons, and IndexError if
    exons were supplied but a token has no exon entry.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help=
        "expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help=
        "expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # input: sequences, exons, maps and filter. Files are opened in
    # ``with`` blocks so that they are closed once they have been read.
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                # skip comment lines
                if line.startswith("#"):
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        # maps "<id1>-<id2>" to the list of hids of previously written pairs
        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            with open(options.filename_filter, "r") as infile:

                iterator = FastaIterator.FastaIterator(infile)

                # records are consumed two at a time; the iterator signals
                # the end of the file by returning None.
                while 1:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    # the identifier is the first whitespace-delimited
                    # word of the title
                    identifier1 = re.match(
                        r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(
                        r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(
                        record1.sequence + ";" + record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # ------------------------------------------------------------------
    # main loop over links. The two alignment objects are allocated once
    # and cleared/reused for every link.
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        # both partners need a sequence
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        # peptide -> nucleotide coordinates: everything is scaled by 3
        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        # map the row (query) through its old2new map, if there is one
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # map the column (sbjct) through its old2new map, if there is one
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")
        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # .get(): a token without a map must not hide the real
                # error behind a KeyError
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % (mode, count) for mode, count in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Exemplo n.º 9
0
def pslMap(options):
    """thread psl alignments using intervals.

    For every ``(match, qx, tx)`` triple produced by
    ``iterator_psl_intervals(options)`` the query-to-target alignment of
    *match* is restricted to pairs of query intervals (*qx*) and target
    intervals (*tx*).  Each interval is unpacked as a
    ``(start, end, value)`` triple.  A restricted match is written to
    ``options.stdout`` if it has more than ``options.min_aligned`` aligned
    positions.  Counts are kept in an ``E.Counter`` and logged with
    ``E.info`` at the end.

    Options read here: ``format``, ``min_aligned``, ``loglevel``,
    ``stdlog`` and ``stdout``.
    """

    # For every format except "gtf" the alignment is cut by coordinate
    # range; for "gtf" the third element of each interval is used as an
    # alignment filter instead (see the else-branch further down).
    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    # the minimum-aligned threshold doubles as minimum interval length
    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        # NOTE(review): because of the ``elif``, tx stays None when both qx
        # and tx are None, and the match is then skipped below - confirm
        # that this is intended.
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        elif tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            # project the query interval onto the target; this uses the
            # query coordinates as given, before the strand flip below
            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            # for matches on the "-" strand the query interval is mirrored
            # within the query length
            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                # target interval does not overlap the projected query
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        # computed for the debug message only
                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)

                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    # A falsy filter value (e.g. the 0 used for the
                    # full-segment fallback above) disables that step.
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                # diagnostics only: the copy built here is just logged
                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                # output requires strictly more than min_aligned positions
                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                # only the first target interval that passes the overlap
                # and length checks is processed for each query interval
                break
            else:
                # the target loop ended without reaching the break above
                c.nooverlap += 1

    E.info("map: %s" % str(c))
Exemplo n.º 10
0
def pslMap(options):
    """thread psl alignments using intervals.

    For every ``(match, qx, tx)`` triple produced by
    ``iterator_psl_intervals(options)`` the query-to-target alignment of
    *match* is restricted to pairs of query intervals (*qx*) and target
    intervals (*tx*).  Each interval is unpacked as a
    ``(start, end, value)`` triple.  A restricted match is written to
    ``options.stdout`` if it has more than ``options.min_aligned`` aligned
    positions.  Counts are kept in an ``E.Counter`` and logged with
    ``E.info`` at the end.

    Options read here: ``format``, ``min_aligned``, ``loglevel``,
    ``stdlog`` and ``stdout``.
    """

    # For every format except "gtf" the alignment is cut by coordinate
    # range; for "gtf" the third element of each interval is used as an
    # alignment filter instead (see the else-branch further down).
    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    # the minimum-aligned threshold doubles as minimum interval length
    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        # NOTE(review): because of the ``elif``, tx stays None when both qx
        # and tx are None, and the match is then skipped below - confirm
        # that this is intended.
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        elif tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            # project the query interval onto the target; this uses the
            # query coordinates as given, before the strand flip below
            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            # for matches on the "-" strand the query interval is mirrored
            # within the query length
            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                # target interval does not overlap the projected query
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        # computed for the debug message only
                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)

                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    # A falsy filter value (e.g. the 0 used for the
                    # full-segment fallback above) disables that step.
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                # diagnostics only: the copy built here is just logged
                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                # output requires strictly more than min_aligned positions
                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                # only the first target interval that passes the overlap
                # and length checks is processed for each query interval
                break
            else:
                # the target loop ended without reaching the break above
                c.nooverlap += 1

    E.info("map: %s" % str(c))
Exemplo n.º 11
0
def main(argv=None):
    """Align two multiple alignments as profiles and write the merged result.

    Options are parsed from *argv*; ``sys.argv`` is used when *argv* is
    empty or ``None``. Exactly two positional arguments are expected: the
    filenames of the two multiple alignments. Both are read, converted with
    ``Mali.convertMali2Alignlib``, turned into profiles and aligned using the
    configured gap penalties and mode. The second alignment is then added to
    the first and the combined multiple alignment is written to
    ``options.stdout``.

    Raises ValueError unless exactly two filenames are supplied.
    """

    argv = argv or sys.argv

    # command line interface: one (flags, keyword arguments) entry per option
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    option_table = (
        (("-o", "--gop"),
         {"dest": "gop",
          "type": "float",
          "help": "gap opening penalty [default=%default]."}),
        (("-e", "--gep"),
         {"dest": "gep",
          "type": "float",
          "help": "gap extension penalty [default=%default]."}),
        (("-m", "--mode"),
         {"dest": "mode",
          "type": "choice",
          "choices": ("global", "local"),
          "help": "alignment mode, global=nw, local=sw [default=%default]."}),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    parser.set_defaults(gop=-12.0, gep=-2.0, format="fasta", mode="local")

    # E.Start adds the common options (-h/--help, ...) and parses argv
    options, args = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1, mali2 = Mali.Mali(), Mali.Mali()

    E.info("read 2 multiple alignments")

    # fill both containers, first positional argument first
    for mali, filename in ((mali1, args[0]), (mali2, args[1])):
        mali.readFromFile(IOTools.openFile(filename, "r"),
                          format=options.format)

    cmali1, cmali2 = [Mali.convertMali2Alignlib(m) for m in (mali1, mali2)]

    # translate the textual mode into the alignlib constant
    if options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    elif options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # set the library-wide default encoder, log-oddor and regularizor
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    # profile-profile alignment; the aligner fills ``result``
    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first along ``result``
    cmali1.add(cmali2, result)

    identifiers = mali1.getIdentifiers() + mali2.getIdentifiers()
    outmali = Mali.convertAlignlib2Mali(cmali1, identifiers=identifiers)
    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 12
0
    # NOTE(review): the header of the enclosing function and the definitions
    # of param_gop / param_gep lie outside this excerpt.
    # One aligner (created with py_ALIGNMENT_LOCAL) and one alignment
    # container are built up front and reused for every input line.
    alignator = alignlib_lite.py_makeAlignatorDPFull(
        alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep)
    map_query2token = alignlib_lite.py_makeAlignmentVector()

    # Input: tab-separated lines on stdin with exactly four fields
    # (query token, sbjct token, query sequence, sbjct sequence).
    # Lines starting with "#" are skipped.
    for line in sys.stdin:
        if line[0] == "#":
            continue

        # line[:-1] drops the last character (presumably the trailing
        # newline); the unpacking fails unless there are four fields.
        query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(
            line[:-1], "\t")

        # reset the shared container before aligning the next pair
        map_query2token.clear()
        row = alignlib_lite.py_makeSequence(query_sequence)
        col = alignlib_lite.py_makeSequence(sbjct_sequence)
        alignator.align(map_query2token, row, col)

        # both measures are multiplied by 100 (presumably converting
        # fractions to percentages -- confirm against alignlib)
        pidentity = 100.0 * \
            alignlib_lite.py_calculatePercentIdentity(
                map_query2token, row, col)
        psimilarity = 100.0 * \
            alignlib_lite.py_calculatePercentSimilarity(map_query2token)
        # Output: one tab-separated line per pair with both tokens, the
        # alignment score, the alignment itself, the two percentages and the
        # number of gaps.
        print string.join(
            map(str,
                (query_token, sbjct_token, map_query2token.getScore(),
                 alignlib_lite.py_AlignmentFormatEmissions(map_query2token),
                 pidentity, psimilarity, map_query2token.getNumGaps())), "\t")


# Run main() only when executed as a script; its return value is passed to
# sys.exit() and so becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Exemplo n.º 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Peptide sequences are read from ``--peptides`` (or stdin) and cds
    sequences from ``--cds``. Each peptide is mapped onto its cds with
    getMapPeptide2Cds(). Depending on ``--output-format`` either the
    alignment or the aligned cds sequence (fasta) is written to
    ``options.stdout``. Peptides without a cds sequence and peptides for
    which getMapPeptide2Cds() raises ValueError are counted and skipped.

    Raises ValueError if no cds file is given, or if in fasta mode the
    mapped cds string is not exactly three times as long as the peptide.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: peptides2cds.py 2890 2010-04-07 08:58:54Z andreas $")

    parser.add_option("-p", "--peptides", dest="filename_peptides", type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds", "--cdnas", dest="filename_cdna", type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with map of peptide identifiers to cdna identifiers [%default].")

    parser.add_option("--output-identifier", dest="output_identifier", type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f", "--output-format=", dest="output_format", type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        # ``peptides`` is kept for backward compatibility; the option itself
        # stores into ``filename_peptides``.
        peptides=None,
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    # hand argv on so that an explicitly supplied *argv* is honoured
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # The option only accepts "cdna" or "peptide"; comparing against "cds"
    # (as before) could never be true.
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        # the identifier is the first whitespace-delimited word of the title
        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        # peptides without an entry in the map are looked up under their own id
        cds_identifier = map_peptide2cds.get(peptide_identifier, peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" % (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            # Use the cds sequence already looked up: indexing cds_sequences
            # with the output identifier fails when a peptide id is mapped
            # to a differently named cds.
            options.stdout.write("\t".join(map(str, (identifier,
                                                     alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                                                     len(p), len(c)))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError("incomplete aligned string for %s: %s, cds=%s" % (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        # flush the stream that was actually written to
        options.stdout.flush()

    # only close what was opened here; stdin is left alone
    if infile is not sys.stdin:
        infile.close()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" % (ninput, noutput, nnosequence, nskipped))

    E.Stop()
Exemplo n.º 14
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Pairwise links are read from stdin. For each link the alignment is
    rebuilt, optionally scaled by three (--expand) and combined with
    per-sequence maps (--map). It is then handed to Write(), either as a
    whole or once per pair of matching exons (--exons). Links whose query or
    sbjct token has no sequence are skipped.

    Raises ValueError for an alignment that extends beyond either sequence
    or, with --codons, has a number of aligned positions that is not a
    multiple of three. Raises IndexError if exon boundaries were supplied but
    are missing for one of the tokens of a link.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences", type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option("-e", "--expand", dest="expand", action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons", action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option("--one-based-coordinates", dest="one_based_coordinates", action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical", action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option("-g", "--no-gaps", dest="no_gaps", action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile", type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option("--filter", dest="filename_filter", type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences=None,
        filename_exons=None,
        filename_map=None,
        filename_outfile=None,
        no_gaps=False,
        format="fasta",
        expand=False,
        require_codons=False,
        no_identical=False,
        min_length=0,
        report_step=100,
        one_based_coordinates=False,
        filename_filter=None)

    # hand argv on so that an explicitly supplied *argv* is honoured
    (options, args) = E.Start(parser, argv=argv, add_mysql_options=True)

    t0 = time.time()
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    # one Map per token, keyed by the token it applies to
    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        # "<id1>-<id2>" -> list of hash ids of previously written pairs
        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            with open(options.filename_filter, "r") as infile:

                iterator = FastaIterator.FastaIterator(infile)

                # Records come in consecutive pairs; the iterator returns
                # None at the end of input, and an unpaired trailing record
                # is ignored.
                while 1:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write("# read filtering information for %i pairs.\n" % len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # containers are allocated once and cleared before each reuse
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    # number of Write() calls per returned mode
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            # scale start positions and alignment strings by three
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" %
                                     str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              map_old2new[link.mQueryToken].mMapOld2New,
                                              map_row2col,
                                              alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" %
                                     str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              map_row2col,
                                              map_old2new[link.mSbjctToken].mMapOld2New,
                                              alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                             (link.mQueryToken,
                              link.mSbjctToken,
                              row_seq.getLength(),
                              col_seq.getLength(),
                              str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(str(alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                                              row_seq,
                                                                              col_seq)) + "\n")
        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # A token need not have a map; .get() keeps a KeyError in
                # the diagnostics from masking the ValueError below.
                options.stdlog.write("# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write("# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                                                         row_seq,
                                                                                         col_seq))

                raise ValueError("incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                # pair_filter used to reference the undefined name
                # ``map_pair2hid`` and raised NameError on this path.
                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join("%s=%i" % (key, value) for key, value in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
Exemplo n.º 15
0
def main(argv=None):
    """Compare the sequences in two fasta files and classify differences.

    Options are parsed from *argv* (``sys.argv`` if *argv* is None). The two
    positional arguments are the fasta files to compare. Sequences are
    matched by identifier after MapIdentifiers() has been applied with
    ``--pattern1`` / ``--pattern2``, and compared case-insensitively.
    Depending on ``--output-section``, missed identifiers, the status of
    differing sequences and the sequences themselves are written to
    ``options.stdout``. Summary counts are logged at the end.

    Raises ValueError if not exactly two files are given or if either file
    contains no sequences; ImportError if ``--correct-gap-shift`` is
    requested but alignlib_lite cannot be imported.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--correct-gap-shift",
                      dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1",
        "--pattern1",
        dest="pattern1",
        type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2",
        "--pattern2",
        dest="pattern2",
        type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    # hand argv on so that an explicitly supplied *argv* is honoured
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        # alignlib_lite is only needed (and only imported) for this option
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict(
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[0], "r")))
    seqs2 = dict(
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[1], "r")))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also seen in set 2
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = [
                        (s1[x], s2[x])
                        for x in range(1, len(s1) - 1)
                        if s1[x] != s2[x]]

                    # Only classify when there is at least one internal
                    # difference: an empty list (both end residues differ,
                    # interior identical) would vacuously satisfy the
                    # checks below and be miscounted as selenocysteine.
                    if differences:
                        # check for Selenocysteins
                        if all(x[0] == "U" or x[1] == "U"
                               for x in differences):
                            ndiff_selenocysteine += 1
                            status = "selenocysteine"

                        # check for masked residues
                        elif all(x[0] in "NX" or x[1] in "NX"
                                 for x in differences):
                            ndiff_masked += 1
                            status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` only
                # prevents entering the loop for an empty sequence. The loop
                # otherwise ends when both sequences are exhausted or on
                # break. Left as is; confirm the intent before changing.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip runs of N present in only one sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                            k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                else:
                    # reached without break: the whole pair could be aligned
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
Exemplo n.º 16
0
def main(argv=None):
    """Profile-align two multiple alignments and output the combined one.

    *argv* supplies the command line; when it is empty or ``None`` the
    interpreter's ``sys.argv`` is used instead. The two positional arguments
    name the multiple alignment files. Each is read and converted with
    ``Mali.convertMali2Alignlib``, a profile is built from each, and the two
    profiles are aligned with the chosen mode and gap penalties. The second
    alignment is then added to the first and the result is written to
    ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two.
    """

    argv = argv or sys.argv

    # setup command line parser
    version_string = (
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $")
    parser = E.OptionParser(version=version_string,
                            usage=globals()["__doc__"])

    # (short flag, long flag, remaining add_option keywords)
    for short_flag, long_flag, keywords in (
            ("-o", "--gop",
             dict(dest="gop", type="float",
                  help="gap opening penalty [default=%default].")),
            ("-e", "--gep",
             dict(dest="gep", type="float",
                  help="gap extension penalty [default=%default].")),
            ("-m", "--mode",
             dict(dest="mode", type="choice",
                  choices=("global", "local"),
                  help="alignment mode, global=nw, local=sw [default=%default].")),
    ):
        parser.add_option(short_flag, long_flag, **keywords)

    parser.set_defaults(gop=-12.0, gep=-2.0, format="fasta", mode="local")

    # add common options (-h/--help, ...) and parse command line
    options, args = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info("read 2 multiple alignments")

    # read the inputs in argument order
    for container, path in zip((mali1, mali2), (args[0], args[1])):
        container.readFromFile(IOTools.openFile(path, "r"),
                               format=options.format)

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    # map the option value onto the alignlib mode constant
    if options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    elif options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults: encoder, log-oddor and regularizor
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    # the aligner stores the profile-profile alignment in ``result``
    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    cmali1.add(cmali2, result)

    # identifiers of the first alignment precede those of the second
    all_identifiers = mali1.getIdentifiers() + mali2.getIdentifiers()
    outmali = Mali.convertAlignlib2Mali(cmali1, identifiers=all_identifiers)

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 17
0
    def __str__(self):
        """Return the entry as a single tab-separated line.

        If ``mExpand`` is set and ``mMapPeptide2Translation`` is not empty,
        the row/column alignment strings are generated from that map and the
        query/sbjct from/to coordinates stored on ``self`` are overwritten
        with the map's boundaries (a side effect of formatting). If the map
        is empty, both alignment strings are empty. Without ``mExpand`` the
        stored ``mQueryAli`` / ``mSbjctAli`` are used.

        The prediction id is emitted as the first column only when it is
        set (truthy); all remaining columns are the same in both cases.
        """

        if self.mExpand:
            if self.mMapPeptide2Translation.getLength() > 0:
                f = alignlib_lite.py_AlignmentFormatEmissions(
                    self.mMapPeptide2Translation)
                row_ali, col_ali = f.mRowAlignment, f.mColAlignment
                # keep the stored coordinates in sync with the map
                self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
                self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
                self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
                self.mSbjctTo = self.mMapPeptide2Translation.getColTo()
            else:
                row_ali, col_ali = "", ""
        else:
            row_ali = self.mQueryAli
            col_ali = self.mSbjctAli

        # Single definition of the column layout; previously duplicated in
        # two branches that differed only by the leading prediction id.
        fields = [
            self.mQueryToken,
            self.mSbjctToken,
            self.mSbjctStrand,
            self.mRank,
            self.score,
            self.mQueryFrom,
            self.mQueryTo,
            row_ali,
            self.mSbjctFrom,
            self.mSbjctTo,
            col_ali,
            self.mQueryLength,
            self.mQueryCoverage,
            self.mNGaps,
            self.mNFrameShifts,
            self.mNIntrons,
            self.mNSplits,
            self.mNStopCodons,
            "%5.2f" % self.mPercentIdentity,
            "%5.2f" % self.mPercentSimilarity,
            self.mTranslation,
            self.mSbjctGenomeFrom,
            self.mSbjctGenomeTo,
            self.mAlignmentString,
            self.mNAssembled,
        ]

        if self.mPredictionId:
            fields.insert(0, self.mPredictionId)

        # str.join instead of string.join(), which was removed in Python 3
        return "\t".join(map(str, fields))
Exemplo n.º 18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads peptide sequences in fasta format (from ``--peptides-fasta-file``
    or stdin) and a collection of cds sequences.  Each peptide is paired
    with a cds sequence - via the optional ``--map`` file, falling back to
    the peptide identifier itself - and the mapping returned by
    ``Peptides2Cds.getMapPeptide2Cds`` is written to ``options.stdout``,
    either as a tab-separated alignment line or as a fasta record.

    Peptides without a cds sequence are counted in ``nnosequence``;
    peptides for which the mapping raises ValueError are counted in
    ``nskipped``.

    Raises ValueError if no cds file is given, or if the aligned string in
    fasta output does not have the expected length.

    NOTE(review): *argv* is defaulted to sys.argv but is not forwarded to
    ``E.Start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p",
                      "--peptides-fasta-file",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c",
                      "--cds-gtf-file",
                      "--cdnas",
                      dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help=
        "filename with map of peptide identifiers to cdna identifiers [%default]."
    )

    parser.add_option("--output-identifier",
                      dest="output_identifier",
                      type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    # The long option used to be spelled "--output-format=" (stray "="),
    # which only worked through optparse's prefix matching.
    parser.add_option("-f",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        # "peptides" is kept for backward compatibility; the option's
        # actual destination is "filename_peptides".
        peptides=None,
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # The permitted choices are "cdna" and "peptide"; comparing against
    # "cds" could never be true.
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        # the identifier is the first whitespace-delimited word of the title
        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            # Report the length of the cds that was actually aligned (c).
            # Looking it up again under the *output* identifier fails with
            # a KeyError when peptide and cds identifiers differ.
            options.stdout.write("\t".join(
                map(str, (identifier,
                          alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                          len(p),
                          len(c)))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        # flush the stream that is actually being written to
        options.stdout.flush()

    if infile is not sys.stdin:
        infile.close()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
Exemplo n.º 19
0
    def __str__(self):
        """Return the prediction as a single tab-separated line.

        The columns are, in order: query token, sbjct token, sbjct strand,
        rank, score, query from/to/alignment, sbjct from/to/alignment, query
        length, query coverage, numbers of gaps, frameshifts, introns,
        splits and stop codons, percent identity and percent similarity
        (both formatted with ``%5.2f``), translation, sbjct genome from/to,
        alignment string and number assembled.  If ``mPredictionId`` is
        set (truthy), it is prepended as an additional first column.

        When ``mExpand`` is set and ``mMapPeptide2Translation`` is not
        empty, the row/column alignment strings are taken from that map
        and ``mQueryFrom``, ``mQueryTo``, ``mSbjctFrom`` and ``mSbjctTo``
        are overwritten on the instance as a side effect.  If the map is
        empty, both alignment strings are empty.  Without ``mExpand`` the
        stored ``mQueryAli``/``mSbjctAli`` are used.
        """

        if self.mExpand:
            if self.mMapPeptide2Translation.getLength() > 0:
                f = alignlib_lite.py_AlignmentFormatEmissions(
                    self.mMapPeptide2Translation)
                row_ali, col_ali = f.mRowAlignment, f.mColAlignment
                self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
                self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
                self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
                self.mSbjctTo = self.mMapPeptide2Translation.getColTo()
            else:
                row_ali, col_ali = "", ""
        else:
            row_ali = self.mQueryAli
            col_ali = self.mSbjctAli

        # Built after the block above so that refreshed coordinates are used.
        fields = [
            self.mQueryToken,
            self.mSbjctToken,
            self.mSbjctStrand,
            self.mRank,
            self.score,
            self.mQueryFrom,
            self.mQueryTo,
            row_ali,
            self.mSbjctFrom,
            self.mSbjctTo,
            col_ali,
            self.mQueryLength,
            self.mQueryCoverage,
            self.mNGaps,
            self.mNFrameShifts,
            self.mNIntrons,
            self.mNSplits,
            self.mNStopCodons,
            "%5.2f" % self.mPercentIdentity,
            "%5.2f" % self.mPercentSimilarity,
            self.mTranslation,
            self.mSbjctGenomeFrom,
            self.mSbjctGenomeTo,
            self.mAlignmentString,
            self.mNAssembled,
        ]

        if self.mPredictionId:
            fields.insert(0, self.mPredictionId)

        # str.join works on Python 2 and 3; string.join is Python 2 only.
        return "\t".join(map(str, fields))
Exemplo n.º 20
0
    def Align(self, method, anchor=0, loglevel=1):
        """Align the two stored sequences and cache the result on self.

        *method* is either the name of a built-in aligner ("dialign",
        "blastz", "dialignlgs", "dba", "nw" or "sw"; "clustal" is recognised
        but raises NotImplementedError) or a callable, which is invoked as
        ``method(sequence1, sequence2, alignment)`` and is expected to fill
        the alignment object.

        *anchor* copies of "A" are attached to both ends of each sequence
        before aligning.  Afterwards the anchored regions are removed from
        the alignment and the alignment is moved by ``-anchor`` in both
        dimensions.

        *loglevel* is not used in this method.

        Raises AlignmentError if the resulting alignment is empty.

        TODO: replace this dispatch with a method class.
        """

        alignment = alignlib_lite.py_makeAlignmentVector()
        flank = "A" * anchor
        padded1 = flank + self.mSequence1 + flank
        padded2 = flank + self.mSequence2 + flank

        self.strand = "+"

        if method == "dialign":
            aligner = WrapperDialign.Dialign(self.mOptionsDialign)
            aligner.Align(padded1, padded2, alignment)
        elif method == "blastz":
            aligner = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
            aligner.Align(padded1, padded2, alignment)
            if aligner.isReverseComplement():
                # record the strand and replace the stored second sequence
                self.strand = "-"
                self.mSequence2 = Genomics.complement(self.mSequence2)
        elif method == "dialignlgs":
            aligner = WrapperDialign.Dialign(self.mOptionsDialignLGS)
            aligner.Align(padded1, padded2, alignment)
        elif method == "dba":
            aligner = WrapperDBA.DBA()
            aligner.Align(padded1, padded2, alignment)
        elif method == "clustal":
            raise NotImplementedError("clustal wrapper needs to be updated")
        elif method == "nw":
            row = alignlib_lite.py_makeSequence(padded1)
            col = alignlib_lite.py_makeSequence(padded2)
            alignator = alignlib_lite.py_makeAlignatorDPFull(
                alignlib_lite.py_ALIGNMENT_GLOBAL,
                gop=-12.0,
                gep=-2.0)
            alignator.align(alignment, row, col)
        elif method == "sw":
            row = alignlib_lite.py_makeSequence(padded1)
            col = alignlib_lite.py_makeSequence(padded2)
            # alignator_sw / min_score_sw are not defined in this method;
            # presumably module-level names - verify against the module.
            alignlib_lite.py_performIterativeAlignment(
                alignment, row, col, alignator_sw, min_score_sw)
        else:
            # anything else is treated as a callback function
            method(padded1, padded2, alignment)

        if alignment.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            # strip the anchors again: trailing region first, then leading
            alignment.removeRowRegion(
                anchor + len(self.mSequence1) + 1, alignment.getRowTo())
            alignment.removeRowRegion(1, anchor)
            alignment.removeColRegion(
                anchor + len(self.mSequence2) + 1, alignment.getColTo())
            alignment.removeColRegion(1, anchor)
            alignment.moveAlignment(-anchor, -anchor)

        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            alignment,
            alignlib_lite.py_makeSequence(self.mSequence1),
            alignlib_lite.py_makeSequence(self.mSequence2))

        self.mMethod = method
        self.mAlignment = alignment
        self.mAlignedSequence1, self.mAlignedSequence2 = (
            explicit.mRowAlignment, explicit.mColAlignment)

        emissions = alignlib_lite.py_AlignmentFormatEmissions(alignment)
        self.mAlignment1, self.mAlignment2 = (
            emissions.mRowAlignment, emissions.mColAlignment)

        self.mAlignmentFrom1 = alignment.getRowFrom()
        self.mAlignmentTo1 = alignment.getRowTo()
        self.mAlignmentFrom2 = alignment.getColFrom()
        self.mAlignmentTo2 = alignment.getColTo()
        self.mNumGaps, self.mLength = (
            alignment.getNumGaps(), alignment.getLength())
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Exemplo n.º 21
0
            continue

        query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t")

        map_query2token.clear()
        row = alignlib_lite.py_makeSequence(query_sequence)
        col = alignlib_lite.py_makeSequence(sbjct_sequence)
        alignator.align(map_query2token, row, col)

        pidentity = 100.0 * alignlib_lite.py_calculatePercentIdentity(map_query2token, row, col)
        psimilarity = 100.0 * alignlib_lite.py_calculatePercentSimilarity(map_query2token)
        print string.join(
            map(
                str,
                (
                    query_token,
                    sbjct_token,
                    map_query2token.getScore(),
                    alignlib_lite.py_AlignmentFormatEmissions(map_query2token),
                    pidentity,
                    psimilarity,
                    map_query2token.getNumGaps(),
                ),
            ),
            "\t",
        )


# Script entry point: run main() with the process arguments and use its
# return value as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Exemplo n.º 22
0
def main(argv=None):
    """Compare the sequences in two fasta files by identifier.

    Exactly two positional arguments (the two files) are required.  Both
    files are read, their identifiers are passed through
    ``MapIdentifiers`` with ``--pattern1``/``--pattern2``, and sequences
    sharing an identifier are compared case-insensitively.  Differing
    pairs are classified as "first", "prefix", "last", "selenocysteine",
    "masked" or "other".  With ``--correct-gap-shift`` an attempt is made
    to reconcile runs of "N" of different length.

    Depending on ``--output``, per-sequence lines are written to
    ``options.stdout``; summary counts go to the log.

    Raises ValueError if the number of files is not two or a file holds
    no sequences, and ImportError if ``--correct-gap-shift`` is requested
    but alignlib_lite cannot be imported.

    NOTE(review): *argv* is defaulted to sys.argv but is not forwarded to
    ``E.Start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: diff_fasta.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift", action="store_true",
                      help="correct gap length shifts in alignments. Requires alignlib_lite.py_ "
                      "[%default]")
    parser.add_option("-1", "--pattern1", dest="pattern1", type="string",
                      help="pattern to extract identifier from in identifiers1. "
                      "[%default]")
    parser.add_option("-2", "--pattern2", dest="pattern2", type="string",
                      help="pattern to extract identifier from in identifiers2. "
                      "[%default]")

    parser.add_option("-o", "--output", dest="output", type="choice", action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" is a regex escape, not a string escape
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite, "
                "but it could not be imported")

    seqs1 = Genomics.ReadPeptideSequences(IOTools.openFile(args[0], "r"))
    seqs2 = Genomics.ReadPeptideSequences(IOTools.openFile(args[1], "r"))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also found in set 2
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        elif len(s1) == len(s2):
            # get all differences:
            # the first and last residues can be different for peptide
            # sequences when comparing my translations with ensembl
            # peptides, so only interior positions are inspected.
            differences = [(c1, c2)
                           for c1, c2 in zip(s1[1:-1], s2[1:-1])
                           if c1 != c2]

            # An empty list (only first AND last residue differ) must not
            # count as selenocysteine/masked; it stays "other".
            if differences:
                # check for Selenocysteins
                if all(c1 == "U" or c2 == "U" for c1, c2 in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues
                elif all(c1 in "NX" or c2 in "NX" for c1, c2 in differences):
                    ndiff_masked += 1
                    status = "masked"

        # correct for different gap lengths
        if options.correct_shift:

            map_a2b = alignlib_lite.py_makeAlignmentVector()

            a, b = 0, 0
            keep = False

            # NOTE(review): x is never advanced, so "x < m" only guards
            # against m == 0; the loop ends through the position test,
            # a mismatch (break) or an IndexError (break).
            x = 0
            while x < m and not (a == len(s1) and b == len(s2)):
                try:
                    if s1[a] != s2[b]:
                        # skip surplus "N" on whichever side has them
                        while s1[a] == "N" and s2[b] != "N":
                            a += 1
                        while s1[a] != "N" and s2[b] == "N":
                            b += 1

                        if s1[a] != s2[b]:
                            break
                except IndexError:
                    sys.stdout.write(
                        "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i\n" %
                        (k, x, a, b, len(s1), len(s2)))
                    break

                a += 1
                b += 1
                map_a2b.addPairExplicit(a, b, 0.0)
            else:
                # loop ended without break: both ends were reached
                keep = True
                nfixed += 1
                f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                sys.stdout.write("fix\t%s\t%s\n" % (k, str(f)))

            if not keep:
                sys.stdout.write("# warning: not fixable: %s\n" % k)

        if write_diff:
            options.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write( """# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))
    E.info("ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
           (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine, ndiff_masked, nfixed,
            ndiff - ndiff_first - ndiff_last - ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
Exemplo n.º 23
0
def main(argv=None):
    """Compare the sequences in two fasta files by identifier.

    Exactly two unparsed positional arguments (the two files) are
    required.  Both files are read, their identifiers are passed through
    ``MapIdentifiers`` with ``--pattern1``/``--pattern2``, and sequences
    sharing an identifier are compared case-insensitively.  Differing
    pairs are classified as "first", "prefix", "last", "selenocysteine",
    "masked" or "other".  With ``--correct-gap-shift`` an attempt is made
    to reconcile runs of "N" of different length.

    Depending on ``--output-section``, per-sequence lines are written to
    ``args.stdout``; summary counts go to the log.

    Raises ValueError if the number of files is not two or a file holds
    no sequences, and ImportError if ``--correct-gap-shift`` is requested
    but alignlib_lite cannot be imported.

    NOTE(review): *argv* is defaulted to sys.argv but is not forwarded to
    ``E.start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py ")

    parser.add_argument(
        "-1", "--pattern1", dest="pattern1", type=str,
        help="pattern to extract identifier from in identifiers1. "
        )

    parser.add_argument(
        "-2", "--pattern2", dest="pattern2", type=str,
        help="pattern to extract identifier from in identifiers2. "
        )

    parser.add_argument(
        "-o", "--output-section", dest="output", type=str,
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output ")

    # raw strings: "\S" is a regex escape, not a string escape
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two files needed to compare.")

    if args.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite, "
                "but it could not be imported")

    seqs1 = {
        x.title: x.sequence for x in FastaIterator.iterate(
            iotools.open_file(unknown[0], "r"))}
    seqs2 = {
        x.title: x.sequence for x in FastaIterator.iterate(
            iotools.open_file(unknown[1], "r"))}

    if not seqs1:
        raise ValueError("first file %s is empty." % (unknown[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (unknown[1]))

    MapIdentifiers(seqs1, args.pattern1)
    MapIdentifiers(seqs2, args.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also found in set 2
    found2 = set()

    write_missed1 = "missed" in args.output
    write_missed2 = "missed" in args.output
    write_seqdiff = "seqdiff" in args.output
    write_diff = "diff" in args.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        elif len(s1) == len(s2):
            # get all differences: the first and last residues
            # can be different for peptide sequences when
            # comparing my translations with ensembl peptides,
            # so only interior positions are inspected.
            differences = [(c1, c2)
                           for c1, c2 in zip(s1[1:-1], s2[1:-1])
                           if c1 != c2]

            # An empty list (only first AND last residue differ) must not
            # count as selenocysteine/masked; it stays "other".
            if differences:
                # check for Selenocysteins
                if all(c1 == "U" or c2 == "U" for c1, c2 in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues
                elif all(c1 in "NX" or c2 in "NX" for c1, c2 in differences):
                    ndiff_masked += 1
                    status = "masked"

        # correct for different gap lengths
        if args.correct_shift:

            map_a2b = alignlib_lite.py_makeAlignmentVector()

            a, b = 0, 0
            keep = False

            # NOTE(review): x is never advanced, so "x < m" only guards
            # against m == 0; the loop ends through the position test,
            # a mismatch (break) or an IndexError (break).
            x = 0
            while x < m and not (a == len(s1) and b == len(s2)):
                try:
                    if s1[a] != s2[b]:
                        # skip surplus "N" on whichever side has them
                        while s1[a] == "N" and s2[b] != "N":
                            a += 1
                        while s1[a] != "N" and s2[b] == "N":
                            b += 1

                        if s1[a] != s2[b]:
                            break
                except IndexError:
                    print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                    break

                a += 1
                b += 1
                map_a2b.addPairExplicit(a, b, 0.0)
            else:
                # loop ended without break: both ends were reached
                keep = True
                nfixed += 1
                f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                print("fix\t%s\t%s" % (k, str(f)))

            if not keep:
                print("# warning: not fixable: %s" % k)

        if write_diff:
            args.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            args.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(seqs2):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    args.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()