Example #1
def readIntervals(infile, options):

    ninput = 0
    t = time.time()

    if options.format == "gtf":

        index = IndexedGenome.IndexedGenome()

        for gffs in GTF.transcript_iterator(GTF.iterator(infile)):

            ali = alignlib_lite.py_makeAlignmentBlocks()
            for gff in gffs:
                if gff.feature != "exon":
                    continue
                ali.addDiagonal(gff.start, gff.end, 0)

            index.add(min([x.start for x in gffs]),
                      max([x.end for x in gffs]),
                      ali)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput,
                       time.time() - t, float(time.time() - t) / ninput))

    elif options.format == "gff":

        index = IndexedGenome.Simple()

        for g in GTF.iterator(infile):

            index.add(g.contig, g.start, g.end)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput, time.time() - t,
                       float(time.time() - t) / ninput))

    E.info("read intervals: %i contigs, %i intervals" % (len(index), ninput))
    return index
Example #2
File: psl2psl.py Project: Q-KIM/cgat
def readIntervals(infile, options):

    ninput = 0
    t = time.time()

    if options.format == "gtf":

        index = IndexedGenome.IndexedGenome()

        for gffs in GTF.transcript_iterator(GTF.iterator(infile)):

            ali = alignlib_lite.py_makeAlignmentBlocks()
            for gff in gffs:
                if gff.feature != "exon":
                    continue
                ali.addDiagonal(gff.start, gff.end, 0)

            index.add(min([x.start for x in gffs]),
                      max([x.end for x in gffs]),
                      ali)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput,
                       time.time() - t, float(time.time() - t) / ninput))

    elif options.format == "gff":

        index = IndexedGenome.Simple()

        for g in GTF.iterator(infile):

            index.add(g.contig, g.start, g.end)
            ninput += 1

            if ninput % options.report_step == 0:
                E.info(
                    "reading intervals - progress: ninput=%i, time=%i, avg=%f"
                    % (ninput, time.time() - t,
                       float(time.time() - t) / ninput))

    E.info("read intervals: %i contigs, %i intervals" % (len(index), ninput))
    return index
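The gtf branch above concatenates all exon ranges of a transcript into a single block alignment before indexing it. A minimal sketch of that step, assuming alignlib_lite is installed; the exon coordinates below are made up for illustration:

import alignlib_lite

exons = [(100, 150), (200, 260)]             # hypothetical exon intervals
ali = alignlib_lite.py_makeAlignmentBlocks()
for start, end in exons:
    # mark the genomic range [start, end) as aligned with no offset
    ali.addDiagonal(start, end, 0)

# the interval stored in the index spans the whole transcript
transcript_start = min(start for start, _ in exons)    # 100
transcript_end = max(end for _, end in exons)           # 260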
Example #3
    def getMapTarget2Query(self):
        """return a map between target to query.

        If the strand is "-", the coordinates for query are on
        the negative strand.
        """

        map_target2query = alignlib_lite.py_makeAlignmentBlocks()

        f = alignlib_lite.py_AlignmentFormatBlat(
            "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" %
            (min(self.mSbjctBlockStarts), max(self.mSbjctBlockStarts),
             min(self.mQueryBlockStarts), max(self.mQueryBlockStarts),
             ",".join([str(x) for x in self.mSbjctBlockStarts]) + ",",
             ",".join([str(x) for x in self.mQueryBlockStarts]) + ",",
             ",".join([str(x) for x in self.mBlockSizes]) + ","))
        f.copy(map_target2query)
        return map_target2query
Example #4
File: Blat.py Project: BioXiao/cgat
    def getMapTarget2Query(self):
        """return a map between target to query.

        If the strand is "-", the coordinates for query are on
        the negative strand.
        """

        map_target2query = alignlib_lite.py_makeAlignmentBlocks()

        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            min(self.mSbjctBlockStarts),
            max(self.mSbjctBlockStarts),
            min(self.mQueryBlockStarts),
            max(self.mQueryBlockStarts),
            ",".join([str(x) for x in self.mSbjctBlockStarts]) + ",",
            ",".join([str(x) for x in self.mQueryBlockStarts]) + ",",
            ",".join([str(x) for x in self.mBlockSizes]) + ","))
        f.copy(map_target2query)
        return map_target2query
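The tab-separated string handed to py_AlignmentFormatBlat above uses PSL-style, comma-terminated block lists. An illustrative, self-contained construction with toy values (not taken from the repository):

sbjct_block_starts = [1000, 1200]    # hypothetical target block starts
query_block_starts = [0, 150]        # hypothetical query block starts
block_sizes = [100, 50]              # hypothetical block lengths

line = "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
    min(sbjct_block_starts), max(sbjct_block_starts),
    min(query_block_starts), max(query_block_starts),
    ",".join(str(x) for x in sbjct_block_starts) + ",",
    ",".join(str(x) for x in query_block_starts) + ",",
    ",".join(str(x) for x in block_sizes) + ",")
# line == '1000\t1200\t0\t150\t1000,1200,\t0,150,\t100,50,\n'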
Example #5
    def fromPair(self,
                 query_start, query_size, query_strand, query_seq,
                 target_start, target_size, target_strand, target_seq):
        '''fill from two aligned sequences.

        Note that sequences are case-sensitive.'''

        self.mQueryLength = query_size
        self.mSbjctLength = target_size

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        assert len(query_seq) == len(target_seq)

        x, y = query_start, target_start
        nmatches, nmismatches = 0, 0
        for q, t in zip(query_seq, target_seq):
            tq, tt = q != "-", t != "-"
            if tq and tt:
                map_query2target.addPair(x, y)
                if q == t:
                    nmatches += 1
                else:
                    nmismatches += 1

            if tq:
                x += 1
            if tt:
                y += 1

        self.mNMatches, self.mNMismatches = nmatches, nmismatches
        self.strand = query_strand
        # the following call will set query_from, query_to for the forward strand
        # though block coordinates might be on the negative strand
        self.fromMap(map_query2target, use_strand=True)

        # if target is on the negative strand, swap strands
        if target_strand == "-":
            # swap target strand - this will also swap the query strand
            self.switchTargetStrand()
Example #6
File: Blat.py Project: BioXiao/cgat
    def fromPair(self,
                 query_start, query_size, query_strand, query_seq,
                 target_start, target_size, target_strand, target_seq):
        '''fill from two aligned sequences.

        Note that sequences are case-sensitive.'''

        self.mQueryLength = query_size
        self.mSbjctLength = target_size

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        assert len(query_seq) == len(target_seq)

        x, y = query_start, target_start
        nmatches, nmismatches = 0, 0
        for q, t in zip(query_seq, target_seq):
            tq, tt = q != "-", t != "-"
            if tq and tt:
                map_query2target.addPair(x, y)
                if q == t:
                    nmatches += 1
                else:
                    nmismatches += 1

            if tq:
                x += 1
            if tt:
                y += 1

        self.mNMatches, self.mNMismatches = nmatches, nmismatches
        self.strand = query_strand
        # the following call will set query_from, query_to for the forward strand
        # though block coordinates might be on the negative strand
        self.fromMap(map_query2target, use_strand=True)

        # if target is on the negative strand, swap strands
        if target_strand == "-":
            # swap target strand - this will also swap the query strand
            self.switchTargetStrand()
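The core of fromPair is the gap-aware walk over the two aligned strings: a residue pair is recorded only when neither sequence shows a gap, and each ungapped position advances its own coordinate. A toy re-run of that loop in plain Python (sequences and coordinates are made up):

query_seq, target_seq = "ACG-T", "AC-GT"    # hypothetical aligned pair
x, y = 10, 100                              # hypothetical start coordinates
pairs, nmatches, nmismatches = [], 0, 0
for q, t in zip(query_seq, target_seq):
    tq, tt = q != "-", t != "-"
    if tq and tt:
        pairs.append((x, y))
        if q == t:
            nmatches += 1
        else:
            nmismatches += 1
    if tq:
        x += 1
    if tt:
        y += 1
# pairs == [(10, 100), (11, 101), (13, 103)]; nmatches == 3, nmismatches == 0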
Example #7
File: gtf2alleles.py Project: SCV/cgat
    def _buildAllele(allele_id,
                     transcript, exons,
                     introns, offsets,
                     virtual_coordinates=False,
                     reference_exons=None):

        def _getOffset(pos, offsets):
            x = 0
            while x < len(offsets) and offsets[x][0] <= pos:
                x += 1
            x -= 1
            if x >= 0:
                return offsets[x][1]
            else:
                return 0

        def _sumIndels(ss):
            '''sum indels within ss'''
            c = 0
            for s in ss:
                c += len(s) - 1
            return c

        def _getEndOffsets(ss):
            '''get the offsets at the start and end of an exon
            caused by deletions at the exon boundaries.'''
            l = len(ss)
            x = 0
            while x < l and ss[x] == "":
                x += 1
            start_offset = x

            x = l - 1
            while x >= 0 and ss[x] == "":
                x -= 1
            if x >= 0:
                return start_offset, (l - 1) - x
            else:
                return start_offset, 0

        def _addCds2Reference(map_cds2reference,
                              cds_start,
                              cds_seq,
                              reference_start):
            '''add cds to reference'''
            c, r = cds_start, reference_start
            for x in cds_seq:
                l = len(x)
                if l == 0:
                    r += 1
                else:
                    map_cds2reference.addPair(c, r)
                    c += l
                    r += 1
        # counts
        is_splice_truncated = False
        is_nmd_knockout = False
        is_stop_truncated = False
        nuncorrected_frameshifts = 0
        ncorrected_frameshifts = 0
        nframeshifts = 0
        nsplice_noncanonical = 0
        reference_first_stop_start = -1
        reference_first_stop_end = -1

        # map between the new cds sequence and the reference
        # sequence
        map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

        ###################################################
        # process first exon
        exon = transcript[0]
        transcript_id = exon.transcript_id

        # collect offset for exon.start
        genome_start = exon.start
        genome_start += _getOffset(genome_start, offsets)
        lcds, cds = 0, []
        cds_starts = [0]

        # still need to deal with deletions of first base:
        exon_starts = [genome_start]
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        exon_seq = "".join(exon_sequence)

        cds.append(exon_seq)
        _addCds2Reference(map_cds2reference,
                          lcds,
                          exon_sequence,
                          exon.start)
        lcds = len(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        # add first exon to genome position
        genome_pos = genome_start + len(exon_seq)
        last_end = exon.end

        # correct for deletions at start/end of exon
        start_offset, end_offset = _getEndOffsets(exon_sequence)

        # length of original transcript
        loriginal = sum([x.end - x.start for x in transcript])

        if E.global_options.loglevel >= 8:
            print "%i: exon_indels (%i-%i):" % (allele_id, exon.start, exon.end)
            for x, c in enumerate(exons[exon_key]):
                if len(c) != 1:
                    print x + exon.start, ":%s:" % c
            print
            print exons[exon_key]
            print "genome_pos=", genome_pos, \
                ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)), \
                ", len(exon_seq)=", len(exon_seq), \
                ", len(exon)=", exon.end - exon.start, \
                ", offsets=%i,%i," % (start_offset, end_offset), \
                ", offset at start=", _getOffset( exon.start, offsets), \
                ", offset at end=", _getOffset(exon.end, offsets)

        for exon in transcript[1:]:

            last_exon_sequence = exon_sequence
            last_start_offset, last_end_offset = start_offset, end_offset

            # get the next intron/exon parameters
            exon_key = (exon.start, exon.end)
            exon_sequence = exons[exon_key]
            start_offset, end_offset = _getEndOffsets(exon_sequence)
            intron_key = (last_end, exon.start)

            if last_end == exon.start:
                # catch empty introns
                intron_sequence = []
                intron_key = None
            else:
                intron_sequence = introns[intron_key]

            intron_seq = "".join(intron_sequence)

            ###################################################
            ###################################################
            ###################################################
            # add preceding intron
            new_exon = True

            if len(intron_seq) > frameshiftsize:

                intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                    intron_seq)
                if intron_name == "unknown":
                    if intron_seq[:2].islower() and intron_seq[-2:].islower():
                        E.debug("%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
                                (transcript_id, intron_name, intron_seq5, intron_seq3))
                        nsplice_noncanonical += 1
                    else:
                        is_splice_truncated = True
                        E.debug("%s: transcript has splice truncated allele: %s: %s:%s" %
                                (transcript_id, intron_name, intron_seq5, intron_seq3))
                        break
                # start a new exon
                cds_starts.append(lcds)

            else:
                # treat as frameshifting intron
                #
                # frame-shifting introns are checked if they are
                # fixed by indels either in the intron itself or
                # the terminal exon sequence. To this end, the effective
                # size of the intron is computed:
                # effective size of intron =
                # indels at terminal x bases at previous exon
                # + size of intron
                # + indels at terminal x bases at next exon
                effective_intron_size = len(intron_seq)
                previous_indels = _sumIndels(
                    last_exon_sequence[max(0, -frameshiftsize):])
                next_indels = _sumIndels(exon_sequence[:frameshiftsize])
                effective_intron_size += previous_indels + next_indels

                if previous_indels + next_indels == 0 and len(intron_seq) % 3 == 0:
                    has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                         is_seleno=is_seleno)
                else:
                    has_stop = False

                if effective_intron_size % 3 == 0 and not has_stop:
                    E.debug("%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)" %
                            (transcript_id, last_end, exon.start,
                             effective_intron_size,
                             len(intron_seq),
                             previous_indels, next_indels,))

                    # add to previous exon
                    cds.append(intron_seq)
                    lcds += len(intron_seq)
                    ncorrected_frameshifts += 1
                    new_exon = False
                else:
                    E.debug("%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)" %
                            (transcript_id, last_end, exon.start,
                             effective_intron_size,
                             len(intron_seq),
                             previous_indels, next_indels,
                             has_stop))

                    nuncorrected_frameshifts += 1
                    # start a new exon
                    cds_starts.append(lcds)

            if E.global_options.loglevel >= 8:
                print "%i: intron_indels (%i-%i):" % (allele_id, last_end, exon.start)
                if intron_key:
                    for x, c in enumerate(introns[intron_key]):
                        if len(c) != 1:
                            print x + last_end, ":%s:" % c
                    print
                    print introns[intron_key]
                    print "genome_pos=", genome_pos, \
                        ",intron=%i-%i" % (genome_pos, genome_pos + len(intron_seq)), \
                        ", len(intron_seq)=", len(intron_seq), \
                        ", len(intron)=", exon.start - last_end, \
                        ", offset at start=", _getOffset( last_end, offsets), \
                        ", offset at end=", _getOffset(exon.start, offsets)
                else:
                    print "empty intron"

            genome_pos += len(intron_seq)

            # assertion - check if genomic coordinate of intron is consistent
            # with offset
            test_offset = _getOffset(exon.start, offsets)
            is_offset = genome_pos - exon.start
            assert is_offset == test_offset, "intron offset difference: %i != %i" % (
                is_offset, test_offset)

            ###################################################
            ###################################################
            ###################################################
            # add the exon
            exon_seq = "".join(exon_sequence)
            cds.append(exon_seq)

            if len(exon_seq) != exon.end - exon.start:
                nframeshifts += 1

            if new_exon:
                if reference_coordinates:
                    exon_starts.append(exon.start + start_offset)
                else:
                    exon_starts.append(genome_pos)

            _addCds2Reference(map_cds2reference,
                              lcds,
                              exon_sequence,
                              exon.start)

            lcds += len(exon_seq)
            last_end = exon.end

            if E.global_options.loglevel >= 8:
                print "%i: exon_indels (%i-%i):" % (allele_id, exon.start, exon.end)
                for x, c in enumerate(exons[exon_key]):
                    if len(c) != 1:
                        print x + exon.start, ":%s:" % c
                print
                print exons[exon_key]
                print "genome_pos=", genome_pos, \
                    ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)), \
                    ", len(exon_seq)=", len(exon_seq), \
                    ", len(exon)=", exon.end - exon.start, \
                    ", offsets=%i,%i," % (start_offset, end_offset), \
                    ", offset at start=", _getOffset( exon.start, offsets), \
                    ", offset at end=", _getOffset(exon.end, offsets)

            genome_pos += len(exon_seq)

            test_offset = _getOffset(exon.end, offsets)
            is_offset = genome_pos - exon.end
            assert is_offset == test_offset, "exon offset difference: %i != %i" % (
                is_offset, test_offset)

        cds = "".join(cds)
        assert lcds == len(cds)

        # fix incomplete codons at the end of the sequence
        if lcds % 3 != 0:
            offset = lcds % 3
            cds = cds[:-offset]

        # add frame correction for transcripts that do not start at frame=0
        start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

        # n are ignored (? in sequence to deal with genes like Muc2)
        peptide = Genomics.translate("n" * start_frame + cds,
                                     is_seleno=is_seleno,
                                     prefer_lowercase=False,
                                     ignore_n=True)

        # find the first stop codon
        if start_frame != 0:
            # ignore first, potentially incomplete base
            pep_first_stop = peptide.upper().find("X", 1)
        else:
            pep_first_stop = peptide.upper().find("X")

        E.debug("%s: translated peptide = %s, first stop at %i" %
                (transcript_id, peptide, pep_first_stop))

        peptide = peptide.replace("?", "x")

        if E.global_options.loglevel >= 8:
            E.debug("peptide=%s" % peptide)
            E.debug("cds=%s" % cds)

        E.debug("%s: start_frame=%i, first stop at %i/%i" % (transcript_id,
                                                             start_frame,
                                                             pep_first_stop,
                                                             len(peptide)))

        lpeptide, lcds = len(peptide), len(cds)

        # check for nonsense-mediated decay
        if pep_first_stop != -1:
            cds_first_stop = pep_first_stop * 3 - start_frame
            if cds_first_stop < cds_starts[-1]:
                if ncorrected_frameshifts or nuncorrected_frameshifts:
                    E.warn("nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected" %
                           (transcript_id,
                            ncorrected_frameshifts,
                            nuncorrected_frameshifts))
                is_nmd_knockout = True
                cds = peptide = ""
                lpeptide, lcds = 0, 0
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            elif pep_first_stop < len(peptide) - 1:
                is_stop_truncated = True
                cds = cds[:cds_first_stop]
                peptide = peptide[:pep_first_stop]
                lpeptide, lcds = len(peptide), len(cds)
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            else:
                E.warn("first stop at %i(cds=%i) ignored: last exon start at %i" %
                       (pep_first_stop,
                        cds_first_stop,
                        cds_starts[-1]))

        else:
            # -1 for no stop codon found
            pep_first_stop = -1
            cds_first_stop = -1
            lpeptide, lcds = len(peptide), len(cds)

        if peptide is None and nframeshifts == 0:
            E.warn(
                "transcript %s is knockout, though there are no indels - must be nonsense mutation" % (transcript_id))

        # build frames
        frames = [start_frame]
        start = start_frame
        l = 0
        for end in cds_starts[1:]:
            l += end - start
            frames.append((3 - l % 3) % 3)
            start = end

        return Allele._make((cds,
                             peptide,
                             len(cds_starts),
                             cds_starts,
                             exon_starts,
                             frames,
                             is_nmd_knockout,
                             is_splice_truncated,
                             is_stop_truncated,
                             nframeshifts,
                             ncorrected_frameshifts,
                             nuncorrected_frameshifts,
                             pep_first_stop,
                             lpeptide,
                             cds_first_stop,
                             lcds,
                             reference_first_stop_start,
                             reference_first_stop_end,
                             loriginal,
                             nsplice_noncanonical,
                             )), map_cds2reference
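_sumIndels and _getEndOffsets above assume a per-base representation of each exon or intron: a list with one string per reference base, where an empty string marks a deleted base and a string longer than one base carries an insertion. A small, self-contained restatement of that model (helper names and the example list are hypothetical):

def sum_indels(ss):
    # net length change relative to the reference (same logic as _sumIndels)
    return sum(len(s) - 1 for s in ss)

def end_offsets(ss):
    # number of deleted bases at either end of the list (cf. _getEndOffsets)
    left = 0
    while left < len(ss) and ss[left] == "":
        left += 1
    right = 0
    while right < len(ss) - left and ss[-(right + 1)] == "":
        right += 1
    return left, right

exon = ["", "A", "CT", "G", ""]   # leading deletion, one insertion, trailing deletion
assert sum_indels(exon) == -1     # -1 (deletion) + 1 (insertion) - 1 (deletion)
assert end_offsets(exon) == (1, 1)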
Example #8
File: gff2psl.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
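The loop above turns the merged exon intervals of a transcript into diagonals of a query-to-genome alignment: transcript coordinates run from 0, and each interval's offset is its genomic start minus the current transcript position. A worked sketch with made-up intervals:

intervals = [(1000, 1100), (1500, 1550)]    # hypothetical merged exons
xstart = 0
diagonals = []
for start, end in intervals:
    xend = xstart + end - start
    diagonals.append((xstart, xend, start - xstart))
    xstart = xend
# diagonals == [(0, 100, 1000), (100, 150, 1400)]
# i.e. transcript 0-100 maps to genome 1000-1100 and transcript 100-150 to 1500-1550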
Example #9
    def _buildAllele(allele_id,
                     transcript,
                     exons,
                     introns,
                     offsets,
                     virtual_coordinates=False,
                     reference_exons=None):
        def _getOffset(pos, offsets):
            x = 0
            while x < len(offsets) and offsets[x][0] <= pos:
                x += 1
            x -= 1
            if x >= 0:
                return offsets[x][1]
            else:
                return 0

        def _sumIndels(ss):
            '''sum indels within ss'''
            c = 0
            for s in ss:
                c += len(s) - 1
            return c

        def _getEndOffsets(ss):
            '''get the offsets at the start and end of an exon
            caused by deletions at the exon boundaries.'''
            l = len(ss)
            x = 0
            while x < l and ss[x] == "":
                x += 1
            start_offset = x

            x = l - 1
            while x >= 0 and ss[x] == "":
                x -= 1
            if x >= 0:
                return start_offset, (l - 1) - x
            else:
                return start_offset, 0

        def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                              reference_start):
            '''add cds to reference'''
            c, r = cds_start, reference_start
            for x in cds_seq:
                l = len(x)
                if l == 0:
                    r += 1
                else:
                    map_cds2reference.addPair(c, r)
                    c += l
                    r += 1

        # counts
        is_splice_truncated = False
        is_nmd_knockout = False
        is_stop_truncated = False
        nuncorrected_frameshifts = 0
        ncorrected_frameshifts = 0
        nframeshifts = 0
        nsplice_noncanonical = 0
        reference_first_stop_start = -1
        reference_first_stop_end = -1

        # map between the new cds sequence and the reference
        # sequence
        map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

        ###################################################
        # process first exon
        exon = transcript[0]
        transcript_id = exon.transcript_id

        # collect offset for exon.start
        genome_start = exon.start
        genome_start += _getOffset(genome_start, offsets)
        lcds, cds = 0, []
        cds_starts = [0]

        # still need to deal with deletions of first base:
        exon_starts = [genome_start]
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        exon_seq = "".join(exon_sequence)

        cds.append(exon_seq)
        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
        lcds = len(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        # add first exon to genome position
        genome_pos = genome_start + len(exon_seq)
        last_end = exon.end

        # correct for deletions at start/end of exon
        start_offset, end_offset = _getEndOffsets(exon_sequence)

        # length of original transcript
        loriginal = sum([x.end - x.start for x in transcript])

        if E.global_options.loglevel >= 8:
            print("%i: exon_indels (%i-%i):" %
                  (allele_id, exon.start, exon.end))
            for x, c in enumerate(exons[exon_key]):
                if len(c) != 1:
                    print(x + exon.start, ":%s:" % c)
            print()
            print(exons[exon_key])
            print("genome_pos=", genome_pos,
                  ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                  ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                  exon.end - exon.start,
                  ", offsets=%i,%i," % (start_offset, end_offset),
                  ", offset at start=", getOffset(exon.start,
                                                  offsets), ", offset at end=",
                  getOffset(exon.end, offsets))

        for exon in transcript[1:]:

            last_exon_sequence = exon_sequence
            last_start_offset, last_end_offset = start_offset, end_offset

            # get the next intron/exon parameters
            exon_key = (exon.start, exon.end)
            exon_sequence = exons[exon_key]
            start_offset, end_offset = _getEndOffsets(exon_sequence)
            intron_key = (last_end, exon.start)

            if last_end == exon.start:
                # catch empty introns
                intron_sequence = []
                intron_key = None
            else:
                intron_sequence = introns[intron_key]

            intron_seq = "".join(intron_sequence)

            ###################################################
            ###################################################
            ###################################################
            # add preceding intron
            new_exon = True

            if len(intron_seq) > frameshiftsize:

                intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                    intron_seq)
                if intron_name == "unknown":
                    if intron_seq[:2].islower() and intron_seq[-2:].islower():
                        E.debug(
                            "%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        nsplice_noncanonical += 1
                    else:
                        is_splice_truncated = True
                        E.debug(
                            "%s: transcript has splice truncated allele: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        break
                # start a new exon
                cds_starts.append(lcds)

            else:
                # treat as frameshifting intron
                #
                # frame-shifting introns are checked if they are
                # fixed by indels either in the intron itself or
                # the terminal exon sequence. To this end, the effective
                # size of the intron is computed:
                # effective size of intron =
                # indels at terminal x bases at previous exon
                # + size of intron
                # + indels at terminal x bases at next exon
                effective_intron_size = len(intron_seq)
                previous_indels = _sumIndels(
                    last_exon_sequence[max(0, -frameshiftsize):])
                next_indels = _sumIndels(exon_sequence[:frameshiftsize])
                effective_intron_size += previous_indels + next_indels

                if previous_indels + next_indels == 0 and len(
                        intron_seq) % 3 == 0:
                    has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                         is_seleno=is_seleno)
                else:
                    has_stop = False

                if effective_intron_size % 3 == 0 and not has_stop:
                    E.debug(
                        "%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)"
                        % (
                            transcript_id,
                            last_end,
                            exon.start,
                            effective_intron_size,
                            len(intron_seq),
                            previous_indels,
                            next_indels,
                        ))

                    # add to previous exon
                    cds.append(intron_seq)
                    lcds += len(intron_seq)
                    ncorrected_frameshifts += 1
                    new_exon = False
                else:
                    E.debug(
                        "%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)"
                        % (transcript_id, last_end, exon.start,
                           effective_intron_size, len(intron_seq),
                           previous_indels, next_indels, has_stop))

                    nuncorrected_frameshifts += 1
                    # start a new exon
                    cds_starts.append(lcds)

            if E.global_options.loglevel >= 8:
                print("%i: intron_indels (%i-%i):" %
                      (allele_id, last_end, exon.start))
                if intron_key:
                    for x, c in enumerate(introns[intron_key]):
                        if len(c) != 1:
                            print(x + last_end, ":%s:" % c)
                    print()
                    print(introns[intron_key])
                    print(
                        "genome_pos=", genome_pos, ",intron=%i-%i" %
                        (genome_pos, genome_pos + len(intron_seq)),
                        ", len(intron_seq)=", len(intron_seq),
                        ", len(intron)=",
                        exon.start - last_end, ", offset at start=",
                        _getOffset(last_end, offsets), ", offset at end=",
                        _getOffset(exon.start, offsets))
                else:
                    print("empty intron")

            genome_pos += len(intron_seq)

            # assertion - check if genomic coordinate of intron is consistent
            # with offset
            test_offset = _getOffset(exon.start, offsets)
            is_offset = genome_pos - exon.start
            assert is_offset == test_offset, "intron offset difference: %i != %i" % (
                is_offset, test_offset)

            ###################################################
            ###################################################
            ###################################################
            # add the exon
            exon_seq = "".join(exon_sequence)
            cds.append(exon_seq)

            if len(exon_seq) != exon.end - exon.start:
                nframeshifts += 1

            if new_exon:
                if reference_coordinates:
                    exon_starts.append(exon.start + start_offset)
                else:
                    exon_starts.append(genome_pos)

            _addCds2Reference(map_cds2reference, lcds, exon_sequence,
                              exon.start)

            lcds += len(exon_seq)
            last_end = exon.end

            if E.global_options.loglevel >= 8:
                print("%i: exon_indels (%i-%i):" %
                      (allele_id, exon.start, exon.end))
                for x, c in enumerate(exons[exon_key]):
                    if len(c) != 1:
                        print(x + exon.start, ":%s:" % c)
                print()
                print(exons[exon_key])
                print("genome_pos=", genome_pos,
                      ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                      ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                      exon.end - exon.start, ", offsets=%i,%i," %
                      (start_offset, end_offset), ", offset at start=",
                      _getOffset(exon.start, offsets), ", offset at end=",
                      _getOffset(exon.end, offsets))

            genome_pos += len(exon_seq)

            test_offset = _getOffset(exon.end, offsets)
            is_offset = genome_pos - exon.end
            assert is_offset == test_offset, "exon offset difference: %i != %i" % (
                is_offset, test_offset)

        cds = "".join(cds)
        assert lcds == len(cds)

        # fix incomplete codons at the end of the sequence
        if lcds % 3 != 0:
            offset = lcds % 3
            cds = cds[:-offset]

        # add frame correction for transcripts that do not start at frame=0
        start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

        # n are ignored (? in sequence to deal with genes like Muc2)
        peptide = Genomics.translate("n" * start_frame + cds,
                                     is_seleno=is_seleno,
                                     prefer_lowercase=False,
                                     ignore_n=True)

        # find the first stop codon
        if start_frame != 0:
            # ignore first, potentially incomplete base
            pep_first_stop = peptide.upper().find("X", 1)
        else:
            pep_first_stop = peptide.upper().find("X")

        E.debug("%s: translated peptide = %s, first stop at %i" %
                (transcript_id, peptide, pep_first_stop))

        peptide = peptide.replace("?", "x")

        if E.global_options.loglevel >= 8:
            E.debug("peptide=%s" % peptide)
            E.debug("cds=%s" % cds)

        E.debug("%s: start_frame=%i, first stop at %i/%i" %
                (transcript_id, start_frame, pep_first_stop, len(peptide)))

        lpeptide, lcds = len(peptide), len(cds)

        # check for nonsense-mediated decay
        if pep_first_stop != -1:
            cds_first_stop = pep_first_stop * 3 - start_frame
            if cds_first_stop < cds_starts[-1]:
                if ncorrected_frameshifts or nuncorrected_frameshifts:
                    E.warn(
                        "nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected"
                        % (transcript_id, ncorrected_frameshifts,
                           nuncorrected_frameshifts))
                is_nmd_knockout = True
                cds = peptide = ""
                lpeptide, lcds = 0, 0
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            elif pep_first_stop < len(peptide) - 1:
                is_stop_truncated = True
                cds = cds[:cds_first_stop]
                peptide = peptide[:pep_first_stop]
                lpeptide, lcds = len(peptide), len(cds)
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            else:
                E.warn(
                    "first stop at %i(cds=%i) ignored: last exon start at %i" %
                    (pep_first_stop, cds_first_stop, cds_starts[-1]))

        else:
            # -1 for no stop codon found
            pep_first_stop = -1
            cds_first_stop = -1
            lpeptide, lcds = len(peptide), len(cds)

        if peptide is None and nframeshifts == 0:
            E.warn(
                "transcript %s is knockout, though there are no indels - must be nonsense mutation"
                % (transcript_id))

        # build frames
        frames = [start_frame]
        start = start_frame
        l = 0
        for end in cds_starts[1:]:
            l += end - start
            frames.append((3 - l % 3) % 3)
            start = end

        return Allele._make((
            cds,
            peptide,
            len(cds_starts),
            cds_starts,
            exon_starts,
            frames,
            is_nmd_knockout,
            is_splice_truncated,
            is_stop_truncated,
            nframeshifts,
            ncorrected_frameshifts,
            nuncorrected_frameshifts,
            pep_first_stop,
            lpeptide,
            cds_first_stop,
            lcds,
            reference_first_stop_start,
            reference_first_stop_end,
            loriginal,
            nsplice_noncanonical,
        )), map_cds2reference
Example #10
File: chain2psl.py Project: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ## initialise counters
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:

            if line.startswith("#"): continue
            if line.strip() == "": continue
            if line.startswith("chain"):
                if lines: yield lines
                lines = []
            lines.append(line)

        yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        (_, _, psl.mSbjctId, target_length, target_strand, target_start,
         target_end, psl.mQueryId, query_length, query_strand, query_start,
         query_end, alignment_id) = lines[0][:-1].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start, query_end, query_length,
              target_start, target_end, target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split()]
            map_query2target.addDiagonal(qstart, qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal(qstart, qstart + size, tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    ## write footer and output benchmark information.
    E.Stop()
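Each data line of a UCSC chain is "size dt dq": the block aligns size bases, then dq bases of the query and dt bases of the target are skipped before the next block. A toy restatement of the coordinate arithmetic used above (all numbers are made up):

blocks = [(50, 10, 0), (30, 0, 5)]   # hypothetical (size, dt, dq) lines
qstart, tstart = 0, 2000             # hypothetical query/target starts
diagonals = []
for size, dt, dq in blocks:
    # a diagonal maps query [qstart, qstart + size) onto the target at offset tstart - qstart
    diagonals.append((qstart, qstart + size, tstart - qstart))
    qstart += size + dq
    tstart += size + dt
# diagonals == [(0, 50, 2000), (50, 80, 2010)]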
Example #11
    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(range(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, {'weight': -d})

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, {'weight': xx.mQueryFrom})
            graph.add_edge(
                x, target, {'weight': xx.mQueryLength - xx.mQueryTo})

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1
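The graph built above encodes the chaining problem: every match is a node, a source and a target node bracket the query, edge weights count the unaligned query bases a transition skips, and the shortest source-to-target path keeps a consistent, non-overlapping subset of matches. A self-contained toy version with hypothetical query coordinates:

import networkx

# (query_from, query_to) of three hypothetical matches on a query of length 100
matches = [(0, 40), (35, 60), (55, 100)]
graph = networkx.DiGraph()
source, target = len(matches), len(matches) + 1

for i, (qfrom, qto) in enumerate(matches):
    graph.add_edge(source, i, weight=qfrom)            # unaligned prefix
    graph.add_edge(i, target, weight=100 - qto)        # unaligned suffix
    for j, (qfrom2, qto2) in enumerate(matches[i + 1:], i + 1):
        if qfrom2 >= qto:                              # in order and non-overlapping
            graph.add_edge(i, j, weight=qfrom2 - qto)  # gap between matches

path = networkx.dijkstra_path(graph, source, target)
kept = [matches[i] for i in path[1:-1]]
# kept == [(0, 40), (55, 100)]: the overlapping middle match is dropped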
Example #12
def pslMap(options):
    """thread psl alignments using intervals.

    """

    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        elif tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)

                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                break
            else:
                c.nooverlap += 1

    E.info("map: %s" % str(c))
Example #13
File: psl2psl.py Project: Q-KIM/cgat
    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(xrange(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, {'weight': -d})

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, {'weight': xx.mQueryFrom})
            graph.add_edge(
                x, target, {'weight': xx.mQueryLength - xx.mQueryTo})

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1
Example #14
File: psl2psl.py Project: Q-KIM/cgat
def pslMap(options):
    """thread psl alignments using intervals.

    """

    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        if tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)

                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                break
            else:
                c.nooverlap += 1

    E.info("map: %s" % str(c))
Example #15
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $", 
                                    usage = globals()["__doc__"] )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    ## initialise counters
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:
            if line.startswith("#"): continue
            if line.strip() == "": continue
            if line.startswith("chain"):
                if lines: yield lines
                lines = []
            lines.append(line)

        if lines:
            yield lines

    for lines in chain_iterator(options.stdin):
        
        ninput += 1
        psl = Blat.Match()

        ( _, 
          _, 
          psl.mSbjctId,
          target_length,
          target_strand,
          target_start,
          target_end,
          psl.mQueryId,
          query_length,
          query_strand,
          query_start, 
          query_end,
          alignment_id ) = lines[0][:-1].split()
        
        ( psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
          psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength ) = \
        [ int(x) for x in 
          (query_start, 
           query_end,
           query_length,
           target_start, 
           target_end,
           target_length) ]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()
        
        qstart, tstart = psl.mQueryStart, psl.mSbjctStart
        
        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split() ]
            map_query2target.addDiagonal( qstart,
                                          qstart + size,
                                          tstart - qstart )
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal( qstart,
                                      qstart + size,
                                      tstart - qstart )

        psl.fromMap( map_query2target )

        # sort out strand
        # target_strand is always positive
        assert( target_strand == "+" )

        # if query strand is negative
        if query_strand == "-": 
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl )
        noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    ## write footer and output benchmark information.
    E.Stop()
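
Each chain data line above is "size dt dq" (a bare "size" on the last line): size is the length of an ungapped block, while dt and dq advance the target and query cursors across the following gap. A self-contained sketch on a made-up chain record (hypothetical data, plain Python, no Blat or alignlib imports), printing the (row_from, row_to, offset) triples that addDiagonal receives above:

# toy chain record; header fields follow the UCSC chain format:
# chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id
record = """chain 1000 chrT 1000 + 100 200 query1 500 + 0 90 1
50 10 0
40
"""

lines = record.strip().split("\n")
header = lines[0].split()
qstart, tstart = int(header[10]), int(header[5])

blocks = []
for line in lines[1:-1]:
    size, dt, dq = [int(x) for x in line.split()]
    blocks.append((qstart, qstart + size, tstart - qstart))
    qstart += size + dq
    tstart += size + dt

size = int(lines[-1])
blocks.append((qstart, qstart + size, tstart - qstart))

print blocks    # [(0, 50, 100), (50, 90, 110)]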
Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand

        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            match.mSbjctFrom,
            match.mSbjctTo,
            match.mQueryFrom,
            match.mQueryTo,
            match.mSbjctBlockStarts,
            match.mQueryBlockStarts,
            match.mBlockSizes))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        values = []
        for x, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(x) >= 0:
                    values.append(v)
                x += 1
        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
Example #17
import alignlib_lite as alignlib

# build an empty block alignment and add one ungapped diagonal:
# rows (query) 10-100 map to columns (target) 10-100 (offset 0)
x = alignlib.py_makeAlignmentBlocks()
x.addDiagonal(10, 100, 0)

print x.getNumAligned(), x.getRowFrom(), x.getRowTo()

# render the alignment in Blat (psl-like) block notation
f = alignlib.py_AlignmentFormatBlat(x)
print str(f)

# copy the formatted blocks back into the alignment object; the format
# itself is unchanged, so the second print shows the same blocks
f.copy(x)
print str(f)