Exemplo n.º 1
0
    def create(self, infile):
        """create profile library from file.

        Every multiple alignment read from *infile* (``profile`` format)
        is converted into an alignlib profile built with
        ``self.mWeightor``, prepared, and passed to ``appendProfile``
        under the name of the multiple alignment.

        *infile* is an open file-like object to read from.

        Returns a tuple ``(ninput, noutput)``: the number of multiple
        alignments read and the number of profiles appended.
        """

        self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")

        # Create/truncate the index file. This method never writes to the
        # handle, so close it immediately instead of leaking it.
        # NOTE(review): presumably appendProfile maintains the index
        # itself - confirm.
        open(self.mFilenameIndex, "w").close()

        # Container that readFromFile fills for each record.
        # NOTE(review): assumes readFromFile replaces the previous
        # contents on every call - confirm.
        mali = Mali.Mali()

        ninput, noutput = 0, 0

        # Read from the handle supplied by the caller, not from sys.stdin.
        while mali.readFromFile(infile, format="profile"):

            ninput += 1

            m = Mali.convertMali2Alignlib(mali)
            p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
            p.prepare()

            self.appendProfile(mali.getName(), p)

            noutput += 1

        return ninput, noutput
Exemplo n.º 2
0
    def create(self, infile):
        """create profile library from file.

        Every multiple alignment read from *infile* (``profile`` format)
        is converted into an alignlib profile built with
        ``self.mWeightor``, prepared, and passed to ``appendProfile``
        under the name of the multiple alignment.

        *infile* is an open file-like object to read from.

        Returns a tuple ``(ninput, noutput)``: the number of multiple
        alignments read and the number of profiles appended.
        """

        self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")

        # Create/truncate the index file. This method never writes to the
        # handle, so close it immediately instead of leaking it.
        # NOTE(review): presumably appendProfile maintains the index
        # itself - confirm.
        open(self.mFilenameIndex, "w").close()

        # Container that readFromFile fills for each record.
        # NOTE(review): assumes readFromFile replaces the previous
        # contents on every call - confirm.
        mali = Mali.Mali()

        ninput, noutput = 0, 0

        # Read from the handle supplied by the caller, not from sys.stdin.
        while mali.readFromFile(infile, format="profile"):

            ninput += 1

            m = Mali.convertMali2Alignlib(mali)
            p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
            p.prepare()

            self.appendProfile(mali.getName(), p)

            noutput += 1

        return ninput, noutput
Exemplo n.º 3
0
    def verify(self, infile):
        """verify data in database against original data.

        For every multiple alignment read from *infile* (``profile``
        format) a profile is rebuilt and compared with the profile
        returned by ``getProfile`` for the same name. Two profiles count
        as different when their lengths or their string representations
        differ.

        *infile* is an open file-like object to read from.

        Returns a tuple ``(ninput, nfound, nnotfound, ndifferent)``.
        """

        # Load the index on first use.
        if not self.mIndex:
            self.__loadIndex()

        # Container that readFromFile fills for each record.
        # NOTE(review): assumes readFromFile replaces the previous
        # contents on every call - confirm.
        mali = Mali.Mali()

        # NOTE(review): nnotfound is never incremented in this method; how
        # getProfile reports a missing name is not visible here, so it is
        # always returned as 0.
        ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0

        # Read from the handle supplied by the caller, not from sys.stdin.
        while mali.readFromFile(infile, format="profile"):

            ninput += 1
            m = Mali.convertMali2Alignlib(mali)
            # NOTE(review): the profile is rebuilt without an explicit
            # weightor - confirm this matches how the stored profiles
            # were built.
            p1 = alignlib_lite.py_makeProfile(m)
            p1.prepare()

            p2 = self.getProfile(mali.getName())

            if p1.getLength() != p2.getLength() or \
                    str(p1) != str(p2):
                ndifferent += 1
                continue

            nfound += 1

        return ninput, nfound, nnotfound, ndifferent
Exemplo n.º 4
0
    def verify(self, infile):
        """verify data in database against original data.

        For every multiple alignment read from *infile* (``profile``
        format) a profile is rebuilt and compared with the profile
        returned by ``getProfile`` for the same name. Two profiles count
        as different when their lengths or their string representations
        differ.

        *infile* is an open file-like object to read from.

        Returns a tuple ``(ninput, nfound, nnotfound, ndifferent)``.
        """

        # Load the index on first use.
        if not self.mIndex:
            self.__loadIndex()

        # Container that readFromFile fills for each record.
        # NOTE(review): assumes readFromFile replaces the previous
        # contents on every call - confirm.
        mali = Mali.Mali()

        # NOTE(review): nnotfound is never incremented in this method; how
        # getProfile reports a missing name is not visible here, so it is
        # always returned as 0.
        ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0

        # Read from the handle supplied by the caller, not from sys.stdin.
        while mali.readFromFile(infile, format="profile"):

            ninput += 1
            m = Mali.convertMali2Alignlib(mali)
            # NOTE(review): the profile is rebuilt without an explicit
            # weightor - confirm this matches how the stored profiles
            # were built.
            p1 = alignlib_lite.py_makeProfile(m)
            p1.prepare()

            p2 = self.getProfile(mali.getName())

            if p1.getLength() != p2.getLength() or \
                    str(p1) != str(p2):
                ndifferent += 1
                continue

            nfound += 1

        return ninput, nfound, nnotfound, ndifferent
Exemplo n.º 5
0
def main(argv=None):
    """script main.

    Reads two multiple alignments, builds a profile from each, aligns
    the two profiles by dynamic programming and writes the combined
    multiple alignment to ``options.stdout``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises ValueError unless exactly two positional arguments (the two
    alignment files) are supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    # `format` has no command line option of its own; it is only
    # available through this default.
    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info("read 2 multiple alignments")

    # Close each input handle explicitly once its alignment is read.
    for mali, filename in ((mali1, args[0]), (mali2, args[1])):
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    else:
        # Fail with a clear message instead of leaving `mode` unbound.
        raise ValueError("unknown alignment mode: %s" % options.mode)

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode,
                                                     options.gop, options.gep)

    # Set the alignlib defaults before the profiles are built.
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # Add the second alignment to the first one using the
    # profile-profile alignment just computed.
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 6
0
def main(argv=None):
    """script main.

    Reads two multiple alignments, builds a profile from each, aligns
    the two profiles by dynamic programming and writes the combined
    multiple alignment to ``options.stdout``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises ValueError unless exactly two positional arguments (the two
    alignment files) are supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e",
                      "--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option(
        "-m",
        "--mode",
        dest="mode",
        type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    # `format` has no command line option of its own; it is only
    # available through this default.
    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info("read 2 multiple alignments")

    # Close each input handle explicitly once its alignment is read.
    for mali, filename in ((mali1, args[0]), (mali2, args[1])):
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    else:
        # Fail with a clear message instead of leaving `mode` unbound.
        raise ValueError("unknown alignment mode: %s" % options.mode)

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode, options.gop,
                                                     options.gep)

    # Set the alignlib defaults before the profiles are built.
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # Add the second alignment to the first one using the
    # profile-profile alignment just computed.
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 7
0
def _alignToProfile(infile, outfile, min_score=0,
                    mali_filename="../data/mouse.fasta"):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from *mali_filename*. Each
    sequence in the fasta file *infile* and its complement (as returned
    by ``Genomics.complement``) are aligned against a profile built from
    the reference; the higher scoring of the two alignments is used.

    Alignments that are empty or score less than *min_score* are
    skipped; a score equal to *min_score* is accepted.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence in
    :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open(mali_filename) as inf:
        mali.readFromFile(inf)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two sequences of each of A, C, G and T, each
    # spanning all n columns, are added to a clone of the alignment
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the original (pseudocount-free)
    # multiple alignment, mapped on the diagonal over all n columns
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    c = E.Counter()

    # both output files are closed even if an alignment step fails
    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        # `sequences` and `ids` are kept in step with the rows that are
        # added to build_mali
        sequences = alignlib.StringVector()
        ids = []

        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        with open(infile) as inf:
            for s in FastaIterator.FastaIterator(inf):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better scoring orientation; on a tie the
                # complement is used
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                r = getParts(m)

                # every part becomes its own row in the output alignment
                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                fields = (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(r),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()),
                )
                outf_log.write("\t".join(map(str, fields)) + "\n")

                c.output += 1

        result = str(
            alignlib.MultAlignmentFormatPlain(build_mali, sequences,
                                              alignlib.UnalignedStacked))

        # each line of the formatted alignment is split into
        # start<TAB>sequence<TAB>end
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
Exemplo n.º 8
0
def _alignToProfile(infile, outfile,
                    min_score=0,
                    mali_filename="../data/mouse.fasta"):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from *mali_filename*. Each
    sequence in the fasta file *infile* and its complement (as returned
    by ``Genomics.complement``) are aligned against a profile built from
    the reference; the higher scoring of the two alignments is used.

    Alignments that are empty or score less than *min_score* are
    skipped; a score equal to *min_score* is accepted.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence in
    :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open(mali_filename) as inf:
        mali.readFromFile(inf)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two sequences of each of A, C, G and T, each
    # spanning all n columns, are added to a clone of the alignment
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode,
                                             -5.0,
                                             -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the original (pseudocount-free)
    # multiple alignment, mapped on the diagonal over all n columns
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    c = E.Counter()

    # both output files are closed even if an alignment step fails
    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write("read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

        # `sequences` and `ids` are kept in step with the rows that are
        # added to build_mali
        sequences = alignlib.StringVector()
        ids = []

        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        with open(infile) as inf:
            for s in FastaIterator.FastaIterator(inf):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better scoring orientation; on a tie the
                # complement is used
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                r = getParts(m)

                # every part becomes its own row in the output alignment
                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                fields = (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(r),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()),
                )
                outf_log.write("\t".join(map(str, fields)) + "\n")

                c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(build_mali,
                                                       sequences,
                                                       alignlib.UnalignedStacked))

        # each line of the formatted alignment is split into
        # start<TAB>sequence<TAB>end
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))