Python BlastAlignments примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: BlastAlignments

Примеров на hotexamples.com: 6

Python BlastAlignments — найдено 6 примеров. Это лучшие примеры кода на Python для CGAT.BlastAlignments, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

Link(3)

Map(1)

ReadMap(1)

iterator_links(1)

Основные методы

Link (3)

Map (1)

ReadMap (1)

iterator_links (1)

Пример #1

Показать файл

Файл: links2fasta.py Проект: santayana/cgat

def main(argv=None):
    """Script entry point: turn links read from stdin into aligned pairs.

    Parses command line options in sys.argv, unless *argv* is given.

    The function optionally loads peptide sequences, exon boundaries,
    old-to-new coordinate maps and a set of previously written pairs
    (``--filter``).  It then iterates over the links yielded by
    ``BlastAlignments.iterator_links(sys.stdin)``, rebuilds the alignment
    for each link and hands it to ``Write``, either once per link or once
    per pair of matching exons when exon boundaries were supplied.

    Links whose query or sbjct token has no entry in the sequence
    collection are counted as skipped.

    NOTE(review): *argv* is defaulted here but is not passed on to
    ``E.Start`` in this block.

    Raises:
        ValueError: if an alignment extends beyond either sequence, or if
            ``--codons`` is set and the number of aligned positions is not
            a multiple of three.
        IndexError: if exon boundaries were supplied but one of the two
            tokens of a link has no entry in them.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help="expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help="expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help="given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    # ------------------------------------------------------------------
    # read all optional input; every file is closed as soon as it is read
    t0 = time.time()
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    # maps are keyed by the token they apply to; comment lines are ignored
    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        # a missing filter file is not an error: the filter just stays empty
        if os.path.exists(options.filename_filter):

            with open(options.filename_filter, "r") as infile:

                iterator = FastaIterator.FastaIterator(infile)

                # records are consumed two at a time; a trailing unpaired
                # record is dropped
                while True:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(record1.sequence + ";" +
                                          record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # ------------------------------------------------------------------
    # process links
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    # number of times each value returned by Write() was seen
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        # both partners need a sequence, otherwise the link is skipped
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        # peptide -> nucleotide coordinates: three positions per residue
        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        # apply the old->new map of the query (row) if there is one
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # apply the old->new map of the sbjct (col) if there is one
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # .get(): a token without a map must not turn this
                # diagnostic into a KeyError that hides the ValueError below
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                # pair_filter previously referenced the undefined name
                # ``map_pair2hid`` and raised a NameError on this path
                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            ["%s=%i" % (key, value) for key, value in counts.items()]))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()

Пример #2

Показать файл

Файл: filter_paralogous_links.py Проект: santayana/cgat

    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line[0] == "#":
            continue
        if line[0] == ">":
            print line[:-1]
            continue

        ninput += 1
        link = BlastAlignments.Link()

        link.Read(line)

        if link.mQueryToken == link.mSbjctToken:
            continue

        keep = 1
        if link.mQueryToken in cds and link.mSbjctToken in cds:
            is_paralog, reason = IsParalogLink(link, cds[link.mQueryToken],
                                               cds[link.mSbjctToken])
            if is_paralog:
                keep = 0
                if param_loglevel >= 2:
                    print "# DISCARDED because %s: %s" % (reason, str(link))
        else:

Пример #3

Показать файл

def main(argv=None):
    """Script entry point: report how well links cover sets of vertices.

    Parses command line options in sys.argv, unless *argv* is given.

    Each file given with ``--vertices`` defines one set of vertices (first
    tab-separated column, lines starting with ``#`` ignored).  Links are
    read from stdin and, for every set, the function prints how many of
    its vertices occurred as query and as sbjct and how many never
    occurred as query ("missed").  Tokens that appear in links but in no
    set are counted as extra vertices and can be saved with ``--extra``;
    missed vertices can be saved with ``--missed``.

    If a vertex is listed in several files, the last file wins.

    Raises:
        ValueError: if no ``--vertices`` file was given.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): ``parser`` is not created anywhere in this block;
    # presumably an ``E.OptionParser`` is set up elsewhere -- confirm.
    parser.add_option("-n",
                      "--vertices",
                      dest="vertices",
                      action="append",
                      help="filename with vertices.")
    parser.add_option("-e",
                      "--extra",
                      dest="filename_extra",
                      type="string",
                      help="filename to store extra vertices in.")
    parser.add_option("-m",
                      "--missed",
                      dest="filename_missed",
                      type="string",
                      help="filename to store missed vertices in.")

    parser.set_defaults(
        vertices=[],
        filename_extra=None,
        filename_missed=None,
    )

    (options, args) = E.Start(parser)

    # The original test compared the length to "" (never true) and raised
    # a plain string, which is not a valid exception.
    if len(options.vertices) == 0:
        raise ValueError("please specify one set of vertices.")

    def percent(count, total):
        # an empty vertex file would otherwise cause a ZeroDivisionError
        if total == 0:
            return 0.0
        return 100 * float(count) / total

    # vertex -> [index of its set, times seen as query, times seen as sbjct]
    vertices = {}
    missed_queries = []
    nvertices = [0] * len(options.vertices)
    for x, f in enumerate(options.vertices):
        with open(f, "r") as infile:
            vv = [line[:-1].split("\t")[0]
                  for line in infile if line[0] != "#"]
        nvertices[x] = len(vv)
        missed_queries.append([])
        for v in vv:
            vertices[v] = [x, 0, 0]
        if options.loglevel >= 1:
            print("# read %i vertices from %s" % (len(vv), f))
            sys.stdout.flush()

    # a single Link instance is re-used for every input line
    l = BlastAlignments.Link()
    extra_vertices = {}
    for line in sys.stdin:

        if line[0] == "#":
            continue

        l.Read(line)

        if l.mQueryToken in vertices:
            vertices[l.mQueryToken][1] += 1
        else:
            extra_vertices[l.mQueryToken] = 1

        if l.mSbjctToken in vertices:
            vertices[l.mSbjctToken][2] += 1
        else:
            extra_vertices[l.mSbjctToken] = 1

    found_queries = [0] * len(options.vertices)
    found_sbjcts = [0] * len(options.vertices)

    for v, (index, nquery, nsbjct) in vertices.items():
        if nquery:
            found_queries[index] += 1
        else:
            missed_queries[index].append(v)

        if nsbjct:
            found_sbjcts[index] += 1

    headers = ("set", "name", "tvertex", "nmissed", "pmissed", "nquery",
               "pquery", "nsbjct", "psbjct")

    print("\t".join(headers))

    for x, name in enumerate(options.vertices):
        nmissed = len(missed_queries[x])
        print("%i\t%s\t%i\t%i\t%5.2f\t%i\t%5.2f\t%i\t%5.2f" % (
            x, name, nvertices[x], nmissed,
            percent(nmissed, nvertices[x]),
            found_queries[x], percent(found_queries[x], nvertices[x]),
            found_sbjcts[x], percent(found_sbjcts[x], nvertices[x])))

    print("//")
    print("%i vertices not in set" % len(extra_vertices))

    if options.filename_extra and len(extra_vertices) > 0:
        with open(options.filename_extra, "w") as outfile:
            for token in extra_vertices:
                outfile.write("%s\n" % token)

    if options.filename_missed:
        # the original never closed this file
        with open(options.filename_missed, "w") as outfile:
            for x, name in enumerate(options.vertices):
                for y in missed_queries[x]:
                    outfile.write("%i\t%s\t%s\n" % (x, name, y))

    E.Stop()

Пример #4

Показать файл

def main(argv=None):
    """Script entry point: map query and/or sbjct of links onto new tokens.

    Parses command line options in sys.argv, unless *argv* is given.

    Alignments are read line by line from ``options.stdin`` and written to
    ``options.stdout``; any columns after the ninth are appended unchanged.
    Depending on the options, query and sbjct are either re-labelled
    (``--map-identity``) or mapped via ``MapAlignment`` using maps read
    with ``BlastAlignments.ReadMap``.

    * With ``--multiple`` every combination of query and sbjct targets is
      written, and links without a map entry are always skipped.
    * Without it, links lacking a map entry are skipped unless
      ``--keep-unmapped`` is set.

    A summary of the counters is logged with ``E.info`` at the end.

    NOTE(review): *argv* is defaulted here but is not passed on to
    ``E.Start`` in this block.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph_map_links.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-q",
                      "--map-query",
                      dest="filename_map_query",
                      type="string",
                      help="filename with queries to map.")

    parser.add_option("-s",
                      "--map-sbjct",
                      dest="filename_map_sbjct",
                      type="string",
                      help="filename with sbjcts to map.")

    parser.add_option("-m",
                      "--multiple",
                      dest="multiple",
                      action="store_true",
                      help="map multiple options [%default].")

    parser.add_option("-k",
                      "--keep-unmapped",
                      dest="keep_unmapped",
                      action="store_true",
                      help="keep unmapped entries [%default].")

    parser.add_option("-i",
                      "--map-identity",
                      dest="map_identity",
                      action="store_true",
                      help="map by identifier [%default].")

    parser.add_option(
        "-n",
        "--non-redundant",
        dest="non_redundant",
        action="store_true",
        help="write only unique links (requires a lot of memory for large graphs) [%default]"
    )

    parser.set_defaults(
        filename_map_query=None,
        filename_map_sbjct=None,
        multiple=False,
        keep_unmapped=False,
        map_identity=False,
        report_step=1000000,
        non_redundant=False)

    (options, args) = E.Start(parser)

    if options.filename_map_query:
        infile = IOTools.openFile(options.filename_map_query, "r")
        if options.map_identity:
            map_query = readIdentityMap(infile)
        else:
            map_query = BlastAlignments.ReadMap(infile, options.multiple)
        infile.close()
        E.info('read maps for %i queries' % len(map_query))
    else:
        map_query = None

    if options.filename_map_sbjct:
        # same file for both sides: share the map instead of reading twice
        if options.filename_map_sbjct == options.filename_map_query:
            map_sbjct = map_query
        else:
            infile = IOTools.openFile(options.filename_map_sbjct, "r")
            if options.map_identity:
                map_sbjct = readIdentityMap(infile)
            else:
                map_sbjct = BlastAlignments.ReadMap(infile, options.multiple)
            infile.close()
        E.info('read maps for %i sbjcts' % len(map_sbjct))
    else:
        map_sbjct = None

    nfailed = 0
    ninput = 0
    nskipped = 0
    noutput = 0

    # number of identical/mapped links
    nsame, nmapped = 0, 0

    # digests of links already written (redundancy check)
    printed = {}

    # a single instance is re-used for every input line
    alignment = BlastAlignments.Map()

    for line in options.stdin:

        if line[0] == "#":
            continue

        data = line[:-1].split("\t")

        alignment.Read(line)
        skip = False
        ninput += 1

        # the original logged str(map), i.e. the builtin function, here
        E.debug(str(alignment))

        if options.loglevel >= 2 and ninput % options.report_step == 0:
            options.stderr.write(
                "# progress: ninput=%i, noutput=%i, nhash=%i\n" %
                (ninput, noutput, len(printed)))

        if options.multiple:
            # each side is a list of targets; [None] when no map is given
            if map_query is not None:
                if alignment.mQueryToken in map_query:
                    mq = map_query[alignment.mQueryToken]
                else:
                    skip = True
            else:
                mq = [None]

            if map_sbjct is not None:
                if alignment.mSbjctToken in map_sbjct:
                    ms = map_sbjct[alignment.mSbjctToken]
                else:
                    skip = True
            else:
                ms = [None]

            if skip:
                nskipped += 1
                continue

            if options.map_identity:

                # only if non_redundant is set, do global comparison
                if not options.non_redundant:
                    printed = {}

                new_map = alignment.GetClone()
                do_redundant = len(mq) > 1 or len(ms) > 1
                for q in mq:
                    for s in ms:

                        new_map.mQueryToken = q
                        new_map.mSbjctToken = s

                        # check for non-redundant links for 1:many or
                        # many:many mappings
                        if do_redundant:
                            key = "%s-%i-%i-%s-%i-%i" % (
                                new_map.mQueryToken, new_map.mQueryFrom,
                                new_map.mQueryTo, new_map.mSbjctToken,
                                new_map.mSbjctFrom, new_map.mSbjctTo)

                            # hash key to save space
                            # NOTE(review): md5() needs bytes on Python 3;
                            # this call assumes a Python 2 str -- confirm.
                            hkey = hashlib.md5(key).digest()

                            if hkey in printed:
                                continue

                            printed[hkey] = 1

                        options.stdout.write('\t'.join([str(new_map)] +
                                                       data[9:]) + '\n')
                        noutput += 1
                        if new_map.mQueryToken == alignment.mQueryToken and \
                                new_map.mSbjctToken == alignment.mSbjctToken:
                            nsame += 1
                        else:
                            nmapped += 1

            else:
                for q in mq:
                    for s in ms:
                        # fresh clone for every combination
                        new_map = alignment.GetClone()

                        E.debug(str(q))
                        E.debug(str(s))

                        is_ok = new_map.MapAlignment(q, s)

                        if not is_ok:
                            nfailed += 1
                        else:
                            options.stdout.write('\t'.join([str(new_map)] +
                                                           data[9:]) + '\n')
                            noutput += 1

        # options.multiple == False
        else:

            if map_query is not None:
                if alignment.mQueryToken in map_query:
                    mq = map_query[alignment.mQueryToken]
                else:
                    mq = None
                    skip = True
            else:
                mq = None

            if map_sbjct is not None:
                if alignment.mSbjctToken in map_sbjct:
                    ms = map_sbjct[alignment.mSbjctToken]
                else:
                    ms = None
                    skip = True
            else:
                ms = None

            if skip and not options.keep_unmapped:
                nskipped += 1
                continue

            E.debug(str(mq))
            E.debug(str(ms))

            # with nothing to map the alignment is written unchanged
            if mq or ms:
                is_ok = alignment.MapAlignment(mq, ms)
            else:
                is_ok = True

            if not is_ok:
                nfailed += 1
            else:
                options.stdout.write('\t'.join([str(alignment)] + data[9:]) +
                                     '\n')
                noutput += 1

    E.info('ninput=%i, noutput=%i, nskipped=%i, nfailed=%i, nsame=%i, nmapped=%i' %
           (ninput, noutput, nskipped, nfailed, nsame, nmapped))

    E.Stop()

Пример #5

Показать файл

Файл: links2fasta.py Проект: BioinformaticsArchive/cgat

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-s", "--sequences", dest="filename_sequences", type="string",
                       help="peptide sequence [Default=%default]" )

    parser.add_option( "-f", "--format", dest="format", type="string",
                       help="output format [Default=%default]" )

    parser.add_option( "-e", "--expand",  dest="expand", action="store_true",
                       help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option( "-m", "--map",  dest="filename_map", type="string",
                       help="map alignments [Default=%default]")
    
    parser.add_option( "-c", "--codons",  dest="require_codons", action="store_true",
                       help="require codons [Default=%default]")

    parser.add_option( "--one-based-coordinates",  dest="one_based_coordinates", action="store_true",
                       help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option( "--no-identical",  dest="no_identical", action="store_true",
                       help="do not output identical pairs [Default=%default]" )

    parser.add_option( "-g", "--no-gaps",  dest="no_gaps", action="store_true",
                       help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option( "-x", "--exons",  dest="filename_exons", type="string",
                       help="filename with exon boundaries [Default=%default]")
    
    parser.add_option( "-o", "--outfile",  dest="filename_outfile", type="string",
                       help="filename to save links [Default=%default]")

    parser.add_option( "--min-length",  dest="min_length", type="int",
                       help="minimum length of alignment [Default=%default]")

    parser.add_option( "--filter",  dest="filename_filter", type="string",
                       help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences = None,
        filename_exons = None,
        filename_map = None,
        filename_outfile = None,
        no_gaps = False,
        format = "fasta",
        expand = False,
        require_codons = False,
        no_identical = False,
        min_length = 0,
        report_step = 100,
        one_based_coordinates = False,
        filename_filter = None)

    (options, args) = E.Start( parser, add_mysql_options = True )

    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") )
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i sequences\n" % len(sequences) )
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries( open(options.filename_exons, "r") )
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i exons\n" % len(exons) )
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#": continue
            m = Map()
            m.read( line )
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i maps\n" % len(map_old2new) )
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:        
            options.stdlog.write( "# reading filtering information.\n" )
            sys.stdout.flush()
            
        map_pair2hids = {}

        if os.path.exists( options.filename_filter ):
            
            infile = open(options.filename_filter, "r")

            iterator = FastaIterator.FastaIterator( infile )

            while 1:
                cur_record = iterator.next()
                if cur_record is None: break

                record1 = cur_record

                cur_record = iterator.next()
                if cur_record is None: break

                record2 = cur_record

                identifier1 = re.match("(\S+)", record1.title).groups()[0]
                identifier2 = re.match("(\S+)", record2.title).groups()[0]

                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if id not in map_pair2hids: map_pair2hids[id] = []

                map_pair2hids[id].append( s )

            infile.close()
            
        if options.loglevel >= 1:        
            options.stdlog.write( "# read filtering information for %i pairs.\n" % len(map_pair2hids) )
            sys.stdout.flush()
    else:
        map_pair2hids = None
        
    if options.loglevel >= 1:
        options.stdlog.write( "# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None
        
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links( sys.stdin ):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write( "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1) )
                sys.stdout.flush()
                
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write( "# read link %s\n" %  str(link) )
            
        row_seq = alignlib_lite.py_makeSequence( sequences[link.mQueryToken] )
        col_seq = alignlib_lite.py_makeSequence( sequences[link.mSbjctToken] )

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3 
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment( link.mQueryAli, 3 )
            link.mSbjctAli = ScaleAlignment( link.mSbjctAli, 3 )            
            
        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli ).copy(  map_row2col )
        
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in row with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mQueryToken].mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                      map_old2new[link.mQueryToken].mMapOld2New,
                                      map_row2col,
                                      alignlib_lite.py_RR )
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()            
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in col with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mSbjctToken].mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                       map_row2col,
                                       map_old2new[link.mSbjctToken].mMapOld2New,
                                       alignlib_lite.py_CR )
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        dr = row_seq.getLength() - map_row2col.getRowTo() 
        dc = col_seq.getLength() - map_row2col.getColTo() 
        if dr < 0 or dc < 0:
            raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %\
                                          (link.mQueryToken,
                                           link.mSbjctToken,
                                           row_seq.getLength(),
                                           col_seq.getLength(),
                                           str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))
            

        if options.loglevel >= 2:
            options.stdlog.write( str( alignlib_lite.py_AlignmentFormatExplicit( map_row2col, 
                                                                         row_seq, 
                                                                         col_seq )) + "\n" )
        ## check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()
            
            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write( "# %s\n" % str(map_row2col) )
                options.stdlog.write( "# %s\n" % str(link) )
                options.stdlog.write( "# %s\n" % str(map_old2new[link.mQueryToken]) )
                options.stdlog.write( "# %s\n" % str(map_old2new[link.mSbjctToken]) )
                options.stdlog.write( "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit( map_row2col, 
                                                                                    row_seq,
                                                                                    col_seq ) )

                raise ValueError("incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken))

        ## if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            ## Get overlapping segments
            segments = Exons.MatchExons( map_row2col, exons1, exons2 )
            
            for a,b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in the input files.

                from1, to1 = GetAdjustedBoundaries( a, exons1 )
                from2, to2 = GetAdjustedBoundaries( b, exons2 )

                alignlib_lite.py_copyAlignment( tmp1_map_row2col, map_row2col,
                                       from1+1, to1, from2+1, to2 )
                
                mode = Write( tmp1_map_row2col, row_seq, col_seq, link,
                              no_gaps = options.no_gaps,
                              no_identical = options.no_identical,
                              min_length = options.min_length,
                              suffix1="_%s" % str(a),
                              suffix2="_%s" % str(b),
                              outfile = outfile,
                              pair_filter = map_pair2hid,
                              format = options.format )

                if mode not in counts: counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write( map_row2col, row_seq, col_seq, link,
                          min_length = options.min_length,                          
                          no_gaps = options.no_gaps,
                          no_identical = options.no_identical,
                          outfile = outfile,
                          pair_filter = map_pair2hids,
                          format = options.format )
            
            if mode not in counts: counts[mode] = 0
            counts[mode] += 1

        noutput += 1
        
    if outfile: outfile.close()
    
    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join( map( lambda x,y: "%s=%i" % (x,y), counts.keys(), counts.values() ) ))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped) )

    E.Stop()

Пример #6

Показать файл

Файл: blast2fasta.py Проект: yangjl/cgat

def main(argv=None):
    """Script main: write a pairwise alignment for every link read from stdin.

    Each non-comment line on stdin is parsed as a ``BlastAlignments.Link``.
    Links whose query or subject token is missing from the sequence file
    are skipped.  For the remaining links the compressed alignment strings
    are expanded and, with ``--format=fasta``, both aligned sequences are
    written to stdout as two FASTA-style records.

    Arguments:
        argv: command line arguments; defaults to ``sys.argv``.
            NOTE(review): *argv* is not forwarded to ``E.Start`` here -
            confirm whether ``E.Start`` should receive it.

    Raises:
        ValueError: if no sequence file was given via ``-s/--sequences``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: blast2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="filename with sequences.")
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format.")

    parser.set_defaults(
        filename_sequences=None,
        format="fasta",
    )

    (options, args) = E.Start(parser)

    if not options.filename_sequences:
        # String exceptions are not valid; raise a real exception type.
        raise ValueError("please supply filename with sequences.")

    # Close the sequence file deterministically once it has been parsed.
    with open(options.filename_sequences, "r") as infile:
        sequences = Genomics.ReadPeptideSequences(infile)

    if options.loglevel >= 1:
        sys.stdout.write("# read %i sequences\n" % len(sequences))

    # Convert every raw sequence into an alignlib sequence object up front
    # so the per-link loop below only performs dictionary lookups.
    for k in list(sequences):
        sequences[k] = alignlib_lite.py_makeSequence(sequences[k])

    if options.loglevel >= 2:
        sys.stdout.write("# converted %i sequences\n" % len(sequences))

    ninput, noutput, nskipped, nfailed = 0, 0, 0, 0

    # A single Link and a single alignment container are reused for all
    # input lines.
    link = BlastAlignments.Link()
    ali = alignlib_lite.py_makeAlignataVector()

    for line in sys.stdin:

        if line.startswith("#"):
            continue

        link.Read(line)
        ninput += 1

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        ali.Clear()
        alignlib_lite.py_fillAlignataCompressed(ali, link.mQueryFrom,
                                                link.mQueryAli,
                                                link.mSbjctFrom,
                                                link.mSbjctAli)

        result = alignlib_lite.py_writePairAlignment(
            sequences[link.mQueryToken], sequences[link.mSbjctToken],
            ali).split("\n")

        if len(result) != 3:
            # Unexpected output shape: count the failure and move on rather
            # than indexing into a malformed result or counting it as output.
            nfailed += 1
            continue

        if options.format == "fasta":
            # The trailing blank line matches what the former print
            # statement emitted (format string newline + print newline).
            sys.stdout.write(
                ">%s %i-%i\n%s\n>%s %i-%i\n%s\n\n" %
                (link.mQueryToken, link.mQueryFrom, link.mQueryTo,
                 result[0].split("\t")[1],
                 link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo,
                 result[1].split("\t")[1]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" %
           (ninput, noutput, nskipped, nfailed))
    E.Stop()