Exemplo n.º 1
0
def analyzeMali(mali, options, prefix_row=""):
    """Write occupancy statistics of a multiple alignment as one table row.

    The row written to ``options.stdout`` consists of *prefix_row* followed
    by eight tab-separated fields: mean, mean in percent, median and median
    in percent of the ``mNChars`` value of every column (percentages are
    relative to ``mali.getLength()``), then the same four values computed
    over every row (percentages relative to ``mali.getWidth()``).

    Arguments
    ---------
    mali
        Alignment object; must support ``len()``, ``values()``,
        ``getColumns()``, ``getLength()`` and ``getWidth()``.
    options
        Object supplying ``gap_chars``, ``mask_chars``, ``loglevel``,
        ``stdlog`` and ``stdout``.
    prefix_row : str
        Text written verbatim before the statistics.

    Returns
    -------
    bool
        False if there are no rows or no columns to analyse (nothing is
        written in that case), True otherwise.

    Raises
    ------
    ValueError
        If *mali* is empty.
    """

    if len(mali) == 0:
        # a bare string cannot be raised; use a proper exception type
        raise ValueError("not analyzing empty multiple alignment")

    # Real lists (not map objects): the data are tested for emptiness and
    # iterated more than once below.
    row_data = [
        Mali.MaliData(x.mString, options.gap_chars, options.mask_chars)
        for x in mali.values()]
    col_data = [
        Mali.MaliData(x, options.gap_chars, options.mask_chars)
        for x in mali.getColumns()]

    if not row_data or not col_data:
        return False

    if options.loglevel >= 2:
        for row in row_data:
            options.stdlog.write("# row: %s\n" % str(row))
        for col in col_data:
            options.stdlog.write("# col: %s\n" % str(col))

    def format_stats(data, total):
        """Return 'mean, %mean, median, %median' of mNChars as a string."""
        counts = [x.mNChars for x in data]
        mean = scipy.mean(counts)
        median = scipy.median(counts)
        # integral medians are printed without decimals
        if float(int(median)) == median:
            median_format = "%i"
        else:
            median_format = "%5.1f"
        return ("%5.2f\t%5.2f\t" + median_format + "\t%5.2f") % (
            mean, 100.0 * mean / total, median, 100.0 * median / total)

    options.stdout.write(prefix_row)

    # column occupancy, relative to mali.getLength()
    options.stdout.write(format_stats(col_data, mali.getLength()))

    # row occupancy, relative to mali.getWidth()
    options.stdout.write("\t" + format_stats(row_data, mali.getWidth()))

    options.stdout.write("\n")

    return True
Exemplo n.º 2
0
def main(argv=None):
    """script main.

    Reads tab-separated lines ``start, alignment, end, id`` from stdin,
    groups consecutive lines sharing the same id into one
    ``Mali.SequenceCollection`` and writes each group to stdout in
    "profile" format. Lines starting with ``#`` are skipped.

    *argv* defaults to ``sys.argv``; it is not forwarded to ``E.Start``
    in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2profiles.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    mali = None
    last_id = None
    ninput, noutput, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line.startswith("#"):
            continue

        # Strip only the line terminator: slicing off the last character
        # would truncate the id when the final line has no newline.
        start, ali, end, seq_id = line.rstrip("\n").split("\t")
        ninput += 1

        if seq_id != last_id:
            # a new group starts: flush the previous one, if any
            if last_id:
                mali.setName(last_id)
                mali.writeToFile(sys.stdout, format="profile")
                noutput += 1
            mali = Mali.SequenceCollection()
            last_id = seq_id

        mali.addSequence(seq_id, start, end, ali)

    # flush the last group
    if last_id:
        mali.setName(last_id)
        mali.writeToFile(sys.stdout, format="profile")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Exemplo n.º 3
0
def runXrate(mali, has_non_overlaps, pairs, map_old2new, options):
    """run xrate on a multiple alignment.

    For every ``(x, y)`` index pair in *pairs* a two-sequence alignment is
    built from the corresponding sequences of *mali* and passed to the
    xrate runner selected by ``options.xrate_model`` ("sn", "akaksgc" or,
    for anything else, the F3X4 runner). Pairs whose alignment is narrower
    than ``options.min_overlap`` are skipped.

    *has_non_overlaps* and *map_old2new* are not used by this function.
    Progress and a final summary are written to ``options.stdlog`` when
    ``options.loglevel >= 1``. Nothing is returned.
    """

    ids = mali.getIdentifiers()

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # Materialise the pairs so the progress report knows the total, and
    # remember the start time; both were referenced but never defined.
    pairs = list(pairs)
    ntotal = len(pairs)
    tstart = time.time()

    ninput, noutput, nskipped = 0, 0, 0

    ## do pairwise run
    for x, y in pairs:
        m1 = mali.getSequence(ids[x])
        ninput += 1
        temp_mali = Mali.Mali()
        m2 = mali.getSequence(ids[y])

        temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
        temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

        ## remove empty columns and masked columns
        if options.clean_mali:
            temp_mali.mGapChars = temp_mali.mGapChars + ("n", "N")
            temp_mali.removeGaps(minimum_gaps=1, frame=3)

        if temp_mali.getWidth() < options.min_overlap:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# pair %s-%s: not computed because only %i residues overlap\n"
                    % (mali.getEntry(ids[x]).mId, mali.getEntry(
                        ids[y]).mId, temp_mali.getWidth()))

            nskipped += 1
            continue

        if options.xrate_model in ("sn", ):
            runXrateSN(xgram, temp_mali, options)
        # one-element tuple: without the comma this was a substring test
        # on the string "akaksgc"
        elif options.xrate_model in ("akaksgc", ):
            runXrateAKaKsGc(xgram, temp_mali, options)
        else:
            runXrateF3X4(xgram, temp_mali, options)

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write(
                "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                (ninput, ntotal, 100.0 * ninput / ntotal,
                 time.time() - tstart))
            options.stdlog.flush()

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairwise computation: ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))
        options.stdlog.flush()
Exemplo n.º 4
0
def main(argv=None):
    """script main.

    Reads the multiple alignment named by ``--filename-mali`` and, for
    method "list2annotation", turns a table read from stdin into a Jalview
    annotation written to ``options.stdout``: the first non-comment line of
    the table is skipped as a header, and the first column of every further
    line is taken as a one-based alignment position to be marked with
    ``options.jalview_symbol``.

    *argv* defaults to ``sys.argv``; it is not forwarded to ``E.Start``
    in this function.

    Raises ValueError if no alignment filename was given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: jalview.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("list2annotation", ),
                      help="methods.")

    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option("--jalview-title", dest="jalview_title", type="string",
                      help="title for jalview annotation.")

    parser.set_defaults(
        method=None,
        jalview_symbol="*",
        jalview_title="anno",
        filename_mali=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_mali:
        # a bare string cannot be raised; use a proper exception type
        raise ValueError("please specify a multiple alignment.")

    mali = Mali.Mali()
    with open(options.filename_mali, "r") as infile:
        mali.readFromFile(infile)

    if options.method == "list2annotation":

        options.stdout.write("JALVIEW_ANNOTATION\n")
        options.stdout.write("# Created: %s\n\n" % (time.asctime(time.localtime(time.time()))))

        # one (initially empty) code per alignment column
        codes = [""] * mali.getWidth()

        header_pending = True
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            if header_pending:
                header_pending = False
                continue

            # Strip only the newline: slicing off the last character would
            # eat a digit when the final line is unterminated.
            position = int(line.rstrip("\n").split("\t")[0])
            codes[position - 1] = options.jalview_symbol

        options.stdout.write("NO_GRAPH\t%s\t%s\n" % (options.jalview_title, "|".join(codes)))

    E.Stop()
Exemplo n.º 5
0
    def loadPair(self, seq1, seq2):
        """Run ``self.mBaseml`` on a pair of sequences.

        The two sequences are placed into a temporary alignment under the
        names "seq1" and "seq2" and handed to ``self.mBaseml.Run`` together
        with the two-leaf tree ``(seq1,seq2);``. The outcome is stored in
        ``self.mResult``; it is None if the run raised
        ``WrapperCodeML.UsageError``.
        """

        pair = Mali.Mali()
        for label, sequence in (("seq1", seq1), ("seq2", seq2)):
            pair.addSequence(label, 0, len(sequence), sequence)

        try:
            outcome = self.mBaseml.Run(pair,
                                       tree="(seq1,seq2);",
                                       dump=self.mDump,
                                       test=self.mTest)
        except WrapperCodeML.UsageError:
            outcome = None

        self.mResult = outcome
Exemplo n.º 6
0
def getMali(mali, columns, block_size=1):
    """Return a new alignment made of selected column blocks of *mali*.

    For every entry of *mali*, the slices
    ``[c * block_size, c * block_size + block_size)`` of its ``mString``
    are concatenated for each ``c`` in *columns* (in the given order). The
    resulting string is added to a fresh ``Mali.Mali`` under the same
    identifier, starting at 0 and ending at
    ``mali.countCharacters(new_string)``.
    """

    subset = Mali.Mali()

    for identifier, entry in mali.items():
        full = entry.mString
        offsets = (col * block_size for col in columns)
        picked = "".join(full[start:start + block_size] for start in offsets)
        subset.addSequence(identifier,
                           0,
                           mali.countCharacters(picked),
                           picked)

    return subset
Exemplo n.º 7
0
    def Run(self, mali,
            tree=None,
            dump=0,
            test=False,
            options={}):
        """Run ``self.mExecutable`` on *mali* and return the resulting alignment.

        *mali* is written in fasta format to a file called "input" inside a
        fresh temporary directory; the executable is started in that
        directory with ``-in input -out output`` and the file "output" is
        read back as a fasta alignment.

        Arguments
        ---------
        mali
            Alignment providing ``writeToFile``.
        tree, options
            Not used by this method (``options`` is never read or mutated).
        dump
            If true, print the captured stdout of the executable.
        test
            If true, print the temporary directory and keep it afterwards.

        Returns
        -------
        Mali.Mali
            The alignment read from the output file.

        Raises
        ------
        UsageError
            If the executable exits with a non-zero status. The temporary
            directory is left in place in that case; its path is part of
            the error message.
        """

        self.mTempdir = tempfile.mkdtemp()
        self.mFilenameInput = "input"
        self.mFilenameOutput = "output"

        if test:
            print("# temporary directory is %s" % self.mTempdir)

        # Close the input file before starting the child process so that
        # all buffered data is on disk when the executable reads it.
        with open(self.mTempdir + "/" + self.mFilenameInput, "w") as outfile:
            mali.writeToFile(outfile, format="fasta")

        statement = " ".join((self.mExecutable,
                              "-in %s" % self.mFilenameInput,
                              "-out %s" % self.mFilenameOutput))

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=self.mTempdir,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise UsageError("Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                self.mExecutable, err, out, self.mTempdir))

        if dump:
            print("# stdout output of %s:\n%s\n######################################" % (
                self.mExecutable, out))

        result = Mali.Mali()

        # The handle must be closed before the directory is removed below.
        with open("%s/%s" % (self.mTempdir, self.mFilenameOutput), "r") as infile:
            result.readFromFile(infile, format="fasta")

        if not test:
            shutil.rmtree(self.mTempdir)

        return result
Exemplo n.º 8
0
def filterMali(mali, method="3rd"):
    """Restrict a multiple alignment to the columns selected by a filter.

    valid methods are
    3rd:        only third positions
    4d:         only four-fold degenerate sites

    The alignment is modified in place via ``mali.takeColumns``; nothing is
    returned.

    For "4d" every sequence is translated codon by codon; a codon column is
    kept when, ignoring gap characters, all sequences carry the same amino
    acid and ``Genomics.DegeneracyAA`` reports a degeneracy of at least 4
    for it. Amino acids missing from that table are skipped.

    Raises
    ------
    ValueError
        If *method* is not one of the valid methods.
    """

    if method not in ("3rd", "4d"):
        # a bare string cannot be raised; use a proper exception type
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        columns = range(2, mali.getWidth(), 3)

    elif method == "4d":
        # translate
        trans_mali = Mali.Mali()
        for seq_id, seq in mali.items():
            sequence = seq.mString
            length = len(sequence)
            translated = [Genomics.MapCodon2AA(sequence[x:x + 3])
                          for x in range(0, length, 3)]
            trans_mali.addSequence(seq_id, 0, length, "".join(translated))

        # get four-fold (or higher) degenerate amino acids
        gap_chars = set(mali.mGapChars)
        columns = []
        for codon_index, aa_column in enumerate(trans_mali.getColumns()):
            residues = set(aa_column).difference(gap_chars)
            if len(residues) != 1:
                continue
            residue = residues.pop().upper()
            try:
                deg = Genomics.DegeneracyAA[residue]
            except KeyError:
                continue
            if deg >= 4:
                # The degenerate site is the third codon position. The
                # "3rd" branch above uses zero-based offset 2 for it, so
                # use codon_index * 3 + 2 here as well (the previous
                # codon_index * 3 selected the first codon position).
                columns.append(codon_index * 3 + 2)

    mali.takeColumns(columns)
Exemplo n.º 9
0
def outputAnnotations(result, options):
    """output the annotations in the model.

    Reads the stockholm-formatted alignment from ``result.getData()``,
    takes its "STATE" annotation and writes it run-length encoded to
    ``options.stdout``: a leading tab followed by comma-separated
    ``state:length`` items, one per run of identical consecutive states.
    An empty annotation produces just the leading tab.
    """

    mali = Mali.Mali()

    mali.readFromFile(result.getData(), format="stockholm")
    annotation = mali.getAnnotation("STATE")

    runs = []
    current, count = None, 0
    for state in annotation:
        if state != current:
            # close the previous run (there is none before the first state)
            if count:
                runs.append("%s:%i" % (current, count))
            current, count = state, 0
        count += 1

    # Close the final run. The count guard avoids emitting a bogus
    # "None:0" item when the annotation is empty.
    if count:
        runs.append("%s:%i" % (current, count))

    options.stdout.write("\t%s" % ",".join(runs))
Exemplo n.º 10
0
    def create(self, infile):
        """Create the profile library from a stream of alignments.

        Opens ``self.mFilenameProfiles`` for binary writing (stored as
        ``self.mOutfileDatabase``) and ``self.mFilenameIndex`` for text
        writing, then repeatedly reads an alignment in "profile" format,
        builds and prepares a profile from it (weighted with
        ``self.mWeightor``) and passes it to ``self.appendProfile`` under
        the alignment's name.

        Arguments
        ---------
        infile
            Not used by this method as written; input is read from
            ``sys.stdin``.

        Returns
        -------
        tuple
            ``(ninput, noutput)``: the number of alignments read and the
            number of profiles appended.
        """

        self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")
        # NOTE(review): this handle is neither written to nor closed in
        # this method; opening with "w" truncates the index file. Confirm
        # how appendProfile is expected to reach the index.
        outfile_index = open(self.mFilenameIndex, "w")

        ninput, noutput = 0, 0

        # NOTE(review): `mali` is not assigned anywhere in this method, so
        # unless a module-level `mali` exists this line raises NameError.
        # Presumably an alignment object should be created before the loop
        # and *infile* read instead of sys.stdin -- TODO confirm.
        while mali.readFromFile(sys.stdin, format="profile"):

            ninput += 1

            m = Mali.convertMali2Alignlib(mali)
            p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
            p.prepare()

            self.appendProfile(mali.getName(), p)

            noutput += 1

        return ninput, noutput
Exemplo n.º 11
0
    def create(self, infile):
        """Create the profile library from a stream of alignments.

        Opens ``self.mFilenameProfiles`` for binary writing (stored as
        ``self.mOutfileDatabase``) and ``self.mFilenameIndex`` for text
        writing, then repeatedly reads an alignment in "profile" format,
        builds and prepares a profile from it (weighted with
        ``self.mWeightor``) and passes it to ``self.appendProfile`` under
        the alignment's name.

        Arguments
        ---------
        infile
            Not used by this method as written; input is read from
            ``sys.stdin``.

        Returns
        -------
        tuple
            ``(ninput, noutput)``: the number of alignments read and the
            number of profiles appended.
        """

        self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")
        # NOTE(review): this handle is neither written to nor closed in
        # this method; opening with "w" truncates the index file. Confirm
        # how appendProfile is expected to reach the index.
        outfile_index = open(self.mFilenameIndex, "w")

        ninput, noutput = 0, 0

        # NOTE(review): `mali` is not assigned anywhere in this method, so
        # unless a module-level `mali` exists this line raises NameError.
        # Presumably an alignment object should be created before the loop
        # and *infile* read instead of sys.stdin -- TODO confirm.
        while mali.readFromFile(sys.stdin, format="profile"):

            ninput += 1

            m = Mali.convertMali2Alignlib(mali)
            p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
            p.prepare()

            self.appendProfile(mali.getName(), p)

            noutput += 1

        return ninput, noutput
Exemplo n.º 12
0
    def verify(self, infile):
        """Verify profiles in the database against the original alignments.

        Loads the index first if ``self.mIndex`` is empty. Each alignment
        read in "profile" format is turned into a prepared profile and
        compared with the profile stored under the same name
        (``self.getProfile``): the two count as different if their lengths
        or their string representations differ.

        Arguments
        ---------
        infile
            Not used by this method as written; input is read from
            ``sys.stdin``.

        Returns
        -------
        tuple
            ``(ninput, nfound, nnotfound, ndifferent)``. ``nnotfound`` is
            never incremented in this method and is always 0.
        """

        if not self.mIndex:
            self.__loadIndex()

        ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0
        # NOTE(review): `mali` is not assigned anywhere in this method, so
        # unless a module-level `mali` exists this line raises NameError.
        # Presumably an alignment object should be created before the loop
        # and *infile* read instead of sys.stdin -- TODO confirm.
        while mali.readFromFile(sys.stdin, format="profile"):

            ninput += 1
            m = Mali.convertMali2Alignlib(mali)
            p1 = alignlib_lite.py_makeProfile(m)
            p1.prepare()

            p2 = self.getProfile(mali.getName())

            # stored and recomputed profile disagree
            if p1.getLength() != p2.getLength() or \
                    str(p1) != str(p2):
                ndifferent += 1
                continue

            nfound += 1

        return ninput, nfound, nnotfound, ndifferent
Exemplo n.º 13
0
    def verify(self, infile):
        """Verify profiles in the database against the original alignments.

        Loads the index first if ``self.mIndex`` is empty. Each alignment
        read in "profile" format is turned into a prepared profile and
        compared with the profile stored under the same name
        (``self.getProfile``): the two count as different if their lengths
        or their string representations differ.

        Arguments
        ---------
        infile
            Not used by this method as written; input is read from
            ``sys.stdin``.

        Returns
        -------
        tuple
            ``(ninput, nfound, nnotfound, ndifferent)``. ``nnotfound`` is
            never incremented in this method and is always 0.
        """

        if not self.mIndex:
            self.__loadIndex()

        ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0
        # NOTE(review): `mali` is not assigned anywhere in this method, so
        # unless a module-level `mali` exists this line raises NameError.
        # Presumably an alignment object should be created before the loop
        # and *infile* read instead of sys.stdin -- TODO confirm.
        while mali.readFromFile(sys.stdin, format="profile"):

            ninput += 1
            m = Mali.convertMali2Alignlib(mali)
            p1 = alignlib_lite.py_makeProfile(m)
            p1.prepare()

            p2 = self.getProfile(mali.getName())

            # stored and recomputed profile disagree
            if p1.getLength() != p2.getLength() or \
                    str(p1) != str(p2):
                ndifferent += 1
                continue

            nfound += 1

        return ninput, nfound, nnotfound, ndifferent
Exemplo n.º 14
0
def _alignToProfile(infile, outfile,
                    min_score=0):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from the hard-coded path
    :file:`../data/mouse.fasta`. A profile is built from it after adding
    two pseudo-sequences for each of A, C, G and T. Every sequence of
    *infile* (fasta) and its ``Genomics.complement`` are aligned against
    the profile and the higher-scoring alignment is used (the complement
    wins ties).

    Alignments of length 0 or with a score below *min_score* are skipped;
    all others are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as mali_file:
        mali.readFromFile(mali_file)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode,
                                             -5.0,
                                             -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the result with the reference alignment on the diagonal
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    # context managers close all three files even if alignment fails
    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write("read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

        sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
        ids = []

        # ungapped reference sequences, in alignment order
        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        c = E.Counter()

        with open(infile) as reads:
            for s in FastaIterator.FastaIterator(reads):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better-scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    best, sequence = map_seq2profile, s.sequence
                else:
                    best, sequence = map_rseq2profile, rsequence

                if best.getLength() == 0:
                    c.skipped += 1
                    continue

                if best.getScore() < min_score:
                    c.skipped += 1
                    continue

                parts = getParts(best)

                # every part becomes its own row in the output alignment
                covered = 0
                for part in parts:
                    build_mali.add(part)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += part.getLength() - part.getNumGaps()

                mali_covered = best.getColTo() - best.getColFrom()

                outf_log.write("\t".join(map(str, (
                    s.title,
                    len(s.sequence),
                    best.getRowFrom(),
                    best.getRowTo(),
                    len(parts),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    best.getScore(),
                    best.getColFrom(),
                    best.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())
                ))) + "\n")

                c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(build_mali,
                                                       sequences,
                                                       alignlib.UnalignedStacked))

        # one formatted row per collected identifier
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" % (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
0
def getMali(component_id, map_component2seq_id, map_component2input_id,
            id_filter, options):
    """Return the multiple alignment for one component.

    The input id is the first group of ``options.pattern_component``
    matched against *component_id*, mapped through
    *map_component2input_id*.

    If ``options.pattern_mali`` contains no ``%s`` it names a single file
    holding all sequences; this file is read once into the module-level
    ``master_mali`` and the entries listed for the component are copied
    from it. Otherwise the ``%s`` placeholders are filled with the input id
    to give a per-component file, which is read and then stripped of all
    entries that are filtered out or not listed for the component.

    With ``options.pattern_filter`` and a non-empty *id_filter*, sequences
    whose first pattern group is not in *id_filter* are dropped. For
    ``options.output_format == "codeml"`` sequence lengths must be
    multiples of three.

    Returns
    -------
    The alignment, named *component_id*, or None if a duplicate entry was
    skipped (``options.skip_doubles``) or the per-component file is missing
    (``options.ignore_missing``).

    Raises
    ------
    ValueError
        On a sequence length that is not a multiple of three (codeml), or
        on a duplicate entry when ``options.skip_doubles`` is not set.
    OSError
        If the per-component file is missing and
        ``options.ignore_missing`` is not set.
    """

    global master_mali

    rx_component = re.compile(options.pattern_component)

    mali = Mali.Mali()

    nsubstitutions = options.pattern_mali.count("%s")

    input_id = rx_component.search(component_id).groups()[0]
    input_id = map_component2input_id[input_id]

    if nsubstitutions == 0:

        if master_mali is None:

            master_mali = Mali.Mali()

            E.debug("retrieving multiple alignment from file %s" %
                    (options.pattern_mali))

            with open(options.pattern_mali, "r") as infile:
                master_mali.readFromFile(infile,
                                         format=options.input_format)

        for s in map_component2seq_id[component_id]:

            if options.pattern_filter and id_filter:
                f = re.search(options.pattern_filter, s).groups()[0]

                if f not in id_filter:
                    E.debug("removing %s from %s: not in filter" %
                            (f, component_id))
                    continue

            if options.output_format == "codeml":
                if len(master_mali[s]) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s is not a multiple of 3: %i" %
                        (s, len(master_mali[s])))

            if s in mali:
                if options.skip_doubles:
                    E.warn("skipped double entry %s in component %s" %
                           (s, component_id))
                    return None
                else:
                    raise ValueError("duplicate entry %s in component %s" %
                                     (s, component_id))

            mali.addEntry(master_mali.getEntry(s))

    else:

        input_filename = options.pattern_mali % tuple(
            [input_id] * nsubstitutions)

        E.debug("retrieving multiple alignment for component %s from file %s" %
                (component_id, input_filename))

        if not os.path.exists(input_filename):
            if options.ignore_missing:
                E.warn("alignment %s not found" % input_filename)
                return None
            else:
                raise OSError("alignment %s not found" % input_filename)

        with open(input_filename, "r") as infile:
            mali.readFromFile(infile, format=options.input_format)

        ## get identifiers (and make a copy, as entries are deleted below)
        for ss in tuple(mali.getIdentifiers()):
            if options.pattern_filter and id_filter:
                f = re.search(options.pattern_filter, ss).groups()[0]

                if f not in id_filter:
                    mali.deleteEntry(ss)
                    if options.loglevel >= 5:
                        options.stdlog.write(
                            "# removing %s from %s: not in filter.\n" %
                            (ss, component_id))
                    continue

            if ss not in map_component2seq_id[component_id]:
                if options.loglevel >= 5:
                    options.stdlog.write(
                        "# removing %s from %s: not in component list.\n" %
                        (ss, component_id))
                mali.deleteEntry(ss)
            else:
                if options.output_format == "codeml":
                    if len(mali[ss]) % 3 != 0:
                        # same exception type as the single-file branch; a
                        # bare string cannot be raised
                        raise ValueError(
                            "length of sequence %s is not a multiple of 3: %i" %
                            (ss, len(mali[ss])))

    mali.setName(component_id)

    return mali
Exemplo n.º 16
0
    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))

    E.debug("species map: %s" % str(map_species2sp))

    identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp)

    njtree = NJTree(identifier_parser=identifier_parser)

    njtree.SetLog(options.stdlog)
    njtree.SetErr(options.stderr)

    if options.filename_tree:
        njtree.SetSpeciesTree(options.filename_tree)

    mali = Mali.Mali()
    if options.filename_alignment == "-":
        infile = sys.stdin
    else:
        infile = open(options.filename_alignment, "r")

    mali.readFromFile(infile, format="fasta")

    if mali.getLength() == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# Warning: single gene tree\n")
        options.stdout.write("(%s:1);\n" % tuple(mali.getIdentifiers()))
    elif mali.getLength() == 2:
        if options.loglevel >= 1:
            options.stdlog.write("# Warning: two gene tree\n")
        options.stdout.write("(%s:1,%s:1);\n" % tuple(mali.getIdentifiers()))
Exemplo n.º 17
0
def selectPositiveSites(results, selection_mode, options, mali=None):
    """returns sites, which are consistently estimated to be positively selected.

    Depending on the option selection_mode, various sites are selected:

    'all': all positive sites are returned
    'consistent': only positive sites that are positive in all models and runs
    'emes': only sites that are > 0.9 in one model and at least >= 0.5 in all other models

    A site counts as positive if its probability reaches
    ``options.filter_probability`` (0.5 for 'emes') and its omega reaches
    ``options.filter_omega``. Only the analyses named in
    ``options.analysis`` ("neb", "beb") and the models in
    ``options.models`` are examined.

    If mali is given and ``options.filter_mali`` is "gaps", positions that
    are not fully aligned are removed.

    Returns
    -------
    tuple
        ``(total_sites, max_per_site)``: the set of selected residues and a
        dictionary mapping every residue that passed the filter in any
        model/run to its largest probability.
    """

    ## filter and extract functions
    if selection_mode == "emes":
        def filter_f(x):
            return x.mProbability >= 0.5 and x.mOmega >= options.filter_omega
    else:
        def filter_f(x):
            return (x.mProbability >= options.filter_probability
                    and x.mOmega >= options.filter_omega)

    ## maximum significance per site (for emes)
    max_per_site = {}

    total_sites = set()

    first = True

    for result in results:

        for model in options.models:

            sites = result.mSites[model]

            # union of the positive sites over the requested analyses
            selected = set()
            for analysis, attribute in (("neb", "mNEB"), ("beb", "mBEB")):
                if analysis not in options.analysis:
                    continue
                for x in getattr(sites, attribute).mPositiveSites:
                    if not filter_f(x):
                        continue
                    selected.add(x.mResidue)
                    max_per_site[x.mResidue] = max(
                        x.mProbability, max_per_site.get(x.mResidue, 0))

            if first:
                total_sites = selected
                first = False
            elif selection_mode == "all":
                total_sites = total_sites.union(selected)
            elif selection_mode in ("consistent", "emes"):
                total_sites = total_sites.intersection(selected)

    if selection_mode == "emes":
        if options.loglevel >= 2:
            options.stdlog.write(
                "# before EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.5))

        # filter according to emes: maximum significance larger than 0.9
        total_sites = set(x for x in total_sites if max_per_site[x] > 0.9)

        if options.loglevel >= 2:
            options.stdlog.write(
                "# after EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.9))

    else:
        if options.loglevel >= 2:
            # attribute name was misspelled (filter_probabiltiy) here
            options.stdlog.write(
                "# extracted %i positive sites: mode %s, P>%5.2f\n" %
                (len(total_sites), selection_mode, options.filter_probability))

    if mali and options.filter_mali:
        if options.filter_mali == "gaps":
            nfiltered = 0
            mali_length = mali.getLength()

            # a list, because columns are looked up by index below
            column_data = [
                Mali.MaliData(x, gap_chars="Nn", mask_chars="-.")
                for x in mali.getColumns()]
            new_sites = set()

            for x in total_sites:

                ## PAML uses one-based coordinates
                column = column_data[x - 1]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# rejected position %i due to mali\n" % x)
                    continue

                new_sites.add(x)

            total_sites = new_sites

        if options.loglevel >= 2:
            options.stdlog.write("# after MALI filtering %i positive sites\n" %
                                 (len(total_sites)))

    return total_sites, max_per_site
Exemplo n.º 18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one codeml "sites" output file per positional argument and, for
    each method requested via ``--methods``, writes a report to
    ``options.stdout``:

    * ``summary-numbers``: per-file counts of positive sites and the
      outcome of the log-likelihood ratio test for each selected model.
    * ``jalview``: a Jalview annotation track per result.
    * ``count-positive-sites``: the number of selected positive sites.
    * ``positive-site-table``: a table of selected sites, optionally
      mapped onto a second alignment (``--filename-map-mali``).

    Raises ValueError for unknown analysis sections or models, and when a
    mapping alignment is given without the input alignment.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function -- confirm whether ``E.Start`` reads sys.argv itself.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode",
                      dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames",
                      type="string",
                      help="input pattern.")

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help=
        "threshold for probability above which to include positive sites [default=%default]."
    )

    parser.add_option(
        "--filter-omega",
        dest="filter_omega",
        type="float",
        help=
        "threshold for omega above which to include positive sites [default=%default]."
    )

    parser.add_option("--models",
                      dest="models",
                      type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis",
                      dest="analysis",
                      type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali",
                      dest="filter_mali",
                      type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option(
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help=
        "filename with multiple alignment used for calculating sites - used for filtering"
    )

    parser.add_option(
        "--filename-map-mali",
        dest="filename_map_mali",
        type="string",
        help="filename with multiple alignment to map sites onto.")

    parser.add_option(
        "--jalview-titles",
        dest="jalview_titles",
        type="string",
        help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol",
                      dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    # Jalview titles default to the input filenames.
    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    # String exceptions are not supported by Python >= 2.6 (they raise a
    # TypeError and lose the message), so raise proper exceptions.
    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'"
                % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    ## filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    ## read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    # filenames of the successfully parsed inputs, parallel to ``results``
    headers = []
    for f in args:
        ninput += 1
        try:
            with open(f, "r") as infile:
                results.append(codeml.parseOutput(infile.readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    ## map of nested model (key) to more general model
    map_nested_models = {'8': '7', '2': '1', '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            mali.readFromFile(infile)
    else:
        mali = None

    ###############################################################
    ###############################################################
    ###############################################################
    ## use multiple alignment to map residues to a reference mali
    ## or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped."
            )

        ## translate the alignments
        def translate(s):
            """Replace the codon string of entry *s* by its translation."""
            sequence = s.mString
            seq = []
            for codon in [
                    sequence[x:x + 3] for x in range(0, len(sequence), 3)
            ]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)

            s.mString = "".join(seq)

        tmali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            tmali.readFromFile(infile)
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        with open(options.filename_map_mali, "r") as infile:
            tmap_mali.readFromFile(infile)

        # only translate the mapping alignment if it is nucleotide
        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            ## see if you can find an identical subsequence and then align to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write( \
"""# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold )

            ## write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            # per-analysis intersection of positive sites over all results
            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for nmethod, result in enumerate(results):

                row_consistent = None

                # The prefix is its own column (see header and the "cons"
                # row below), so it must be followed by a tab.
                if options.prefix:
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    ## do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    for x, analysis in enumerate(options.analysis):

                        # analysis was validated above to be neb or beb
                        if analysis == "neb":
                            positive_sites = sites.mNEB.mPositiveSites
                        elif analysis == "beb":
                            positive_sites = sites.mBEB.mPositiveSites

                        s = set(map(extract_f,
                                    filter(filter_f, positive_sites)))

                        options.stdout.write("\t%i" % (len(s)))

                        # sites of a model failing the LRT are reported
                        # above but do not count as consistent.
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %\
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

            # NOTE(review): the "cons" row below has no nseq column and only
            # two columns per model where the header has four -- confirm the
            # intended layout before relying on column positions.
            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                for x, analysis in enumerate(options.analysis):

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write("\t%i\t%i\n" %
                                 (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            # index of the next title; only advanced for results that
            # produce an annotation row.
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                if len(sites) == 0:
                    continue

                codes = [""] * result.mLength

                # sites are one-based positions
                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(results,
                                                      options.selection_mode,
                                                      options, mali)

            # Use a separate name: ``headers`` holds the input filenames
            # needed by the summary-numbers method.
            table_headers = ["site", "P"]
            if map_old2new:
                table_headers.append("mapped")
                table_headers.append("Pm")

            options.stdout.write("\t".join(table_headers) + "\n")

            sites = sorted(sites)
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    # a mapped position of 0 is treated as "not mapped"
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write("# unmapped residue: %i\n" %
                                                 site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Exemplo n.º 19
0
def main(argv=sys.argv):
    """Convert and manipulate a multiple alignment read from stdin.

    The alignment is read from ``options.stdin`` in ``--input-format``,
    the methods given as a ','-separated list via ``--method`` are applied
    in order, and the result is written to ``options.stdout`` in
    ``--output-format``.

    Methods that need arguments consume them from the front of the
    ','-separated ``--parameters`` stack.

    Raises ValueError if the alignment read is empty.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function. Method names not listed below are silently ignored
    (but still reported as "applied").
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    # 2. apply the requested methods in the order given
    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            with open(options.parameters[0], "r") as infile:
                map_id2offset = IOTools.ReadMap(infile,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            # The parameter is either a file mapping identifiers to
            # transitions, or a ':'-separated list of positions that is
            # stored under the key "mali".
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as infile:
                    map_id2transitions = IOTools.readMultiMap(
                        infile, map_functions=(str, int))
            else:
                map_id2transitions = {}
                map_id2transitions["mali"] = sorted(
                    int(p) for p in options.parameters[0].split(':'))

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            with open(options.parameters[0], "r") as infile:
                other_mali.readFromFile(infile, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            # the part after the dash selects the masker
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # This branch previously repeated the "exclude-with-stop" test and
        # was therefore unreachable.
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    # 3. write the result
    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Exemplo n.º 20
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them to stdout.

    For every ``(x, y)`` in *pairs* the two sequences are written as a
    two-sequence stockholm alignment into a temporary directory, the
    substitution model selected by ``options.distance`` (``K80``,
    ``JC69`` or ``REV``) is fitted, and one tab-separated row
    (distance, log-likelihood, alpha, kappa, message) is written to
    ``options.stdout``. The temporary directory is removed on exit, also
    if an error occurs.

    If ``options.test_xrate`` is set, a grid of starting values for alpha
    and beta is trained for each pair instead and one row is written per
    starting point.

    Arguments
    ---------
    mali : alignment providing ``getSequence`` and ``getIdentifiers``.
    pairs : iterable of ``(x, y)`` index pairs into the alignment's
        identifier list (presumably -- verify against the caller).
    options : parsed command line options.

    Raises ValueError if ``options.distance`` is not supported.
    """

    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    # ``ids`` was used below without being defined in this function.
    # NOTE(review): assumes *pairs* index into mali.getIdentifiers() --
    # confirm against the caller.
    ids = mali.getIdentifiers()

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    tempdir = tempfile.mkdtemp()

    # make sure the temporary directory is removed even on errors
    try:
        data = tempdir + "/data"

        if options.distance == "K80":
            model = DNA.buildModel(substitution_model="k80")
        elif options.distance == "JC69":
            model = DNA.buildModel(substitution_model="jc69")
        elif options.distance == "REV":
            model = DNA.buildModel(substitution_model="gtr")
        else:
            # string exceptions are not supported by Python >= 2.6
            raise ValueError("distance %s not implemented for xrate" %
                             (options.distance))

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write("\t%s" %
                                     Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # write the pair together with a two-leaf tree (#=GF NH line)
            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha, xbeta = \
                            (trained_model.mGrammar.getParameter('alpha'),
                             trained_model.mGrammar.getParameter('beta'))
                        # this assumes that the branch length in the input is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha, o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'))
                # this assumes that the branch length in the input is normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)

                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (3.0 * float(
                    re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]],
                    mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)
Exemplo n.º 21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin and writes one prediction per
    sequence to ``options.stdout``. For each sequence the exon columns
    reported by ``getAlignedColumns`` are grouped into codons, in-frame
    introns are inserted between codons that are more than 10 residues
    apart, and the resulting peptide-to-genome alignment is stored in a
    ``Prediction`` object.

    Exits with status 2 if positional arguments are given. Raises
    ValueError if a sequence does not start with an exon (lower-case
    character), contains no exon codons, or contains a gap between codons
    of 1-10 residues (untreated case).

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` in
    this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-l",
                      "--filename-locations",
                      dest="filename_locations",
                      type="string",
                      help="filename with locations")

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="the master determines the frame.")

    parser.set_defaults(filename_locations=None, gap_chars="-.", master=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if args:
        # same output as the former ``print USAGE, "..."`` statement
        sys.stdout.write("%s no arguments required.\n" % (USAGE, ))
        sys.exit(2)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    identifiers = mali.getIdentifiers()

    aligned_columns, aligned_exons = getAlignedColumns(mali, options)

    map_id2location = {}

    if options.filename_locations:
        with open(options.filename_locations, "r") as infile:
            map_id2location = IOTools.ReadMap(infile)

    options.stdout.write(Prediction.Prediction().getHeader() + "\n")

    nid = 1

    for identifier in identifiers:

        if options.loglevel >= 2:
            options.stdlog.write("# processing %s\n" % (identifier))

        entry = mali.getEntry(identifier)

        sequence = entry.mString
        # string exceptions are not supported by Python >= 2.6
        if not sequence or sequence[0] not in string.ascii_lowercase:
            raise ValueError("all sequences should start with an exon.")

        alignment = []

        last_codon = []
        codon = []
        # number of non-gap characters seen so far in this sequence
        n = 0

        last_master_residue = 0
        master_residue = 0
        for column, c in enumerate(sequence):

            if c in options.gap_chars:
                continue

            is_exon = column in aligned_exons

            if is_exon:
                master_residue = aligned_exons[column]
                codon.append((n, master_residue))

            n += 1

            # check if we have a complete codon
            if is_exon:
                # A codon is complete, if it ends at frame 2 or
                # it spans more than one codons in the master.
                # Gaps in the master that are a multiple of 3 are ignored
                master_gap = master_residue - last_master_residue - 1

                if master_residue % 3 == 2 or (master_gap % 3 != 0
                                               and master_gap > 0):

                    if last_codon:
                        # residues between the previous codon and this one
                        intron_length = codon[0][0] - last_codon[-1][0] - 1
                        if intron_length > 0:
                            # add in-frame introns
                            if intron_length > 10:
                                alignment.append(["5", 0, 2])
                                alignment.append(["I", 0, intron_length - 4])
                                alignment.append(["3", 0, 2])
                            else:
                                raise ValueError("untreated case")

                    alignment += processCodon(codon)
                    last_codon = codon
                    codon = []

            last_master_residue = master_residue

        if not alignment:
            raise ValueError(
                "no exon codons found in sequence %s" % (identifier))

        # merge adjacent segments of the same type
        last = alignment[0]
        new_alignment = []
        for this in alignment[1:]:
            if this[0] == last[0]:
                last[1] += this[1]
                last[2] += this[2]
                continue

            new_alignment.append(last)
            last = this

        new_alignment.append(last)

        if options.loglevel >= 4:
            options.stdlog.write("# output=%s\n" % (str(new_alignment)))

        assert (new_alignment[-1][2] % 3 == 0)

        # total length of the alignment on the genomic sequence
        lalignment = sum(x[2] for x in new_alignment)

        prediction = Prediction.Prediction()

        prediction.mQueryToken = identifier

        genomic_sequence = re.sub("[%s]" % options.gap_chars, "",
                                  mali[identifier])

        prediction.mPredictionId = nid
        nid += 1

        if identifier in map_id2location:

            # location format: token:strand:from:to[:...]
            prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = map_id2location[
                identifier].split(":")[:4]

            prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom
            prediction.mSbjctGenomeTo = int(sto)

        else:
            prediction.mSbjctToken = "unk"
            prediction.mSbjctStrand = "+"
            prediction.mSbjctGenomeFrom = 0

        prediction.mQueryCoverage = 100
        prediction.mPercentIdentity = 100
        prediction.mPercentSimilarity = 100

        prediction.mQueryLength = prediction.mQueryTo

        # the end coordinate is always derived from the alignment length
        prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment

        prediction.mMapPeptide2Genome = new_alignment
        prediction.mAlignmentString = " ".join(
            " ".join(map(str, x)) for x in prediction.mMapPeptide2Genome)

        prediction.mMapPeptide2Translation, prediction.mTranslation = Genomics.Alignment2PeptideAlignment(
            prediction.mMapPeptide2Genome, 0, 0, genomic_sequence)

        (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps, prediction.mNSplits, prediction.mNStopCodons, disruptions) = \
            Genomics.CountGeneFeatures(0,
                                       prediction.mMapPeptide2Genome,
                                       genomic_sequence)

        options.stdout.write(str(prediction) + "\n")

    E.Stop()
Exemplo n.º 22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment (or, with ``--pairwise``/``--replicates``,
    a plain sequence collection) from stdin, writes the header of the
    result table to ``options.stdout`` and hands the data to
    ``processMali``.  With ``--replicates`` the identifiers are split
    into equally sized consecutive groups and ``processMali`` is called
    once per group.

    Raises:
        ValueError: if the input alignment is empty.
        AssertionError: if the number of sequences is not a multiple of
            ``--replicates``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2kaks.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--set-omega",
                      dest="omega",
                      type="float",
                      help="initial omega value.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--fix-omega",
                      dest="fix_omega",
                      action="store_true",
                      help="do not estimate omega.")

    parser.add_option("--set-codon-frequencies",
                      dest="codon_frequencies",
                      type="choice",
                      choices=("uniform", "fequal", "f3x4", "f1x4", "f61"),
                      help="set codon frequencies.")

    parser.add_option("--set-method",
                      dest="paml_method",
                      type="int",
                      help="set paml optimization method [0|1].")

    parser.add_option("--set-sequence-type",
                      dest="seqtype",
                      type="choice",
                      choices=("codon", "aa", "trans"),
                      help="sequence type.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump raw output [%default].")

    parser.add_option("--set-optimization-threshold",
                      dest="optimization_threshold",
                      type="string",
                      help="set paml optimization threshold [%default].")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment [%default].")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison [%default].")

    parser.add_option("--iteration",
                      dest="iteration",
                      type="choice",
                      choices=("all-vs-all", "first-vs-all", "pairwise",
                               "tree"),
                      help="iteration mode [%default].")

    parser.add_option(
        "--no-clean",
        dest="clean_mali",
        action="store_false",
        help=
        "do not clean multiple alignment before submitting to codeml. It might take too long for very large sequences."
    )

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("paml", "xrate"),
                      help="choose method for rate computation [%default]")

    parser.add_option("--xrate-model",
                      dest="xrate_model",
                      type="choice",
                      choices=("f3x4-two", "f3x4-four", "sn", "akaksgc",
                               "ef3x4-four", "f3x4-fourproducts"),
                      help="models to use [%default].")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input_fixed", "trained_fixed",
                               "input_variable", "trained_variable", "all"),
                      help="output sections to write [%default].")

    parser.add_option("-o",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files [%default].")

    parser.add_option("--xrate-insert-frequencies",
                      dest="xrate_insert_frequencies",
                      action="store_true",
                      help="estimate codon frequencies from input [%default].")

    parser.add_option("--xrate-uniform-frequencies",
                      dest="xrate_insert_frequencies",
                      action="store_false",
                      help="use uniform codon frequencies [%default].")

    parser.add_option("--xrate-fix-frequencies",
                      dest="xrate_fix_frequencies",
                      action="store_true",
                      help="set initial frequencies to const [%default].")

    parser.add_option("--xrate-estimate-frequencies",
                      dest="xrate_fix_frequencies",
                      action="store_false",
                      help="estimate nucleotide frequencies [%default].")

    parser.add_option(
        "--xrate-fix-rates",
        dest="fix_rates",
        type="string",
        help=
        """fix rates to specified values. Note that the number of rates has to match the ones
in the model. Provide values in a comma-separated list [%default].""")

    parser.add_option(
        "--xrate-min-increment",
        dest="xrate_min_increment",
        type=float,
        help="minimum increment to stop iteration in xrate [%default].")

    parser.add_option(
        "--min-overlap",
        dest="min_overlap",
        type="int",
        help="minimum overlap between a sequence pair in residues [%default].")

    parser.add_option(
        "--with-rho",
        dest="with_rho",
        action="store_true",
        help=
        "output rho values (substitution rates per codon). This requires a patched version of PAML [%default]."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions [%default]."
    )

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons [%default].")

    parser.add_option(
        "--replicates",
        dest="replicates",
        type="int",
        help="in benchmarking mode expect ## replicates [%default].")

    parser.add_option("--tree",
                      dest="tree",
                      type="string",
                      help="use tree for estimation [%default].")

    parser.set_defaults(
        input_format="fasta",
        omega=None,
        codon_frequencies=None,
        paml_method=None,
        optimization_threshold=None,
        seqtype="codon",
        dump=False,
        clean_data=False,
        min_overlap=60,
        gap_chars="-.",
        mask_chars="nN",
        pairwise=False,
        kappa=None,
        fix_kappa=False,
        fix_omega=False,
        clean_mali=True,
        method="paml",
        report_step=1000,
        loglevel=1,
        xrate_insert_frequencies=False,
        xrate_fix_frequencies=False,
        write=[],
        output_pattern="%s.eg",
        value_format="%6.4f",
        fix_rates=None,
        xrate_from_parameters=False,
        xrate_model="f3x4-four",
        with_rho=False,
        with_counts=False,
        iteration="all-vs-all",
        remove_stops=False,
        xrate_min_increment=0.000001,
        replicates=None,
        tree=None,
    )

    # NOTE(review): argv is resolved above but not forwarded to E.Start
    # in this script - confirm that E.Start falls back to sys.argv.
    (options, args) = E.Start(parser)

    if options.method == "xrate":
        # imports for xrate computation
        from XGram.Generator.Prebuilt import Codons
        from XGram.Model import Annotation
        import XGram.Run
        import Bio.Data.CodonTable

        # paml like estimation using xrate
        if options.codon_frequencies == "uniform":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = False
        elif options.codon_frequencies == "f3x4":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = True
    elif options.method == "paml":
        if not options.codon_frequencies:
            options.codon_frequencies = "F3X4"

    if options.fix_rates:
        # build a real list so the rates can be indexed and re-iterated
        options.fix_rates = [float(x) for x in options.fix_rates.split(",")]

    if options.pairwise or options.replicates:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    E.info("read multiple alignment")

    if mali.getLength() == 0:
        # string exceptions are not valid exception objects
        raise ValueError("refusing to process empty alignment.")

    ################################################################
    ################################################################
    ################################################################
    # setup methods
    ################################################################

    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    if options.with_counts:
        options.stdout.write("\t%s" % Genomics.SequencePairInfo().getHeader())

    options.stdout.write("\terror_str\n")

    if options.replicates is not None:
        ids = mali.getIdentifiers()
        assert (len(ids) % options.replicates == 0)
        # floor division: the group size is used as a range() step and
        # as a slice bound, so it has to stay an integer
        group_size = len(ids) // options.replicates
        for start in range(0, len(ids), group_size):
            replicate = Mali.Mali()
            for identifier in ids[start:start + group_size]:
                replicate.addEntry(mali.getEntry(identifier))
            processMali(replicate, options)
    else:
        processMali(mali, options)

    E.Stop()
Exemplo n.º 23
0
 def __init__(self):
     """Set up the instance with no Stockholm data and a new ``Mali.Mali``."""
     # nothing has been assigned to the Stockholm slot yet
     self.mStockholm = None
     # freshly constructed multiple alignment container
     self.mMali = Mali.Mali()
Exemplo n.º 24
0
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Write a report for one set of per-site results to ``options.stdout``.

    The report is selected by ``options.method``:

    * ``summary-slr``: one line with the model parameters and the site
      counts stored on *result*.
    * ``summary-filtered``: one line with site counts, skipping every
      codon whose first alignment column has ``mNChars`` different from
      ``mali.getLength()``.
    * ``positive-site-table`` / ``negative-site-table``: one line per
      sequence listing the selected sites.
    * ``positive-site-list`` / ``negative-site-list``: one line per
      sequence and selected site.

    Any other value of ``options.method`` writes nothing.

    Arguments:
        result: object holding the per-site results (``mSites``) and the
            model summary attributes read below.
        options: parsed options; ``method`` and ``stdout`` are always
            read, the site methods additionally read
            ``significance_threshold``, ``use_adjusted``,
            ``truncate_sites_list`` and ``context_size``.
        mali: multiple alignment; required for every method except
            ``summary-slr``.
        prefix: optional string written as first column of each line.
        p_value: optional value echoed in the site tables ("na" if None).

    Returns:
        A ``Result(nfiltered, ntotal, nsynonymous, nnegative, npositive)``
        for ``summary-filtered``, otherwise None.

    Raises:
        ValueError: if the alignment width is not three times the number
            of sites in *result*.
    """

    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            ["%i" % result.mNPositiveSites[x][0] for x in thresholds]))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            ["%i" % result.mNNegativeSites[x] for x in thresholds]))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered", "positive-site-table",
                            "negative-site-table", "neutral-site-table",
                            "positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        # A list (not a lazy map object) because columns are indexed below.
        # NOTE(review): gap_chars/mask_chars look swapped compared to the
        # usual "-." / "Nn" assignment - kept as in the original, confirm.
        column_data = [
            Mali.MaliData(x, gap_chars="Nn", mask_chars="-.")
            for x in mali.getColumns()
        ]

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." %
                (mali_width, len(result.mSites)))

        if options.method == "summary-filtered":
            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for index, site in enumerate(result.mSites):
                # first column of the codon decides whether it is used
                column = column_data[index * 3]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous, nnegative,
                            npositive)

        else:
            # one of the *-site-table / *-site-list methods
            select_positive_sites = options.method in ("positive-site-table",
                                                       "positive-site-list")
            select_negative_sites = options.method in ("negative-site-table",
                                                       "negative-site-list")
            # NOTE(review): for the neutral-site-* methods neither flag is
            # set, so no site is ever kept - confirm this is intended.

            # iterate over sites and output those under xxx selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for x in range(mali_length)]

            sites = []

            for col, site in enumerate(result.mSites):

                column = column_data[col * 3]

                if column.mNChars != mali_length:
                    continue

                keep = False

                if select_positive_sites and site.isPositive(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                elif select_negative_sites and site.isNegative(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                if keep:
                    sites.append((col, site))

            # reported count is taken before any truncation
            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                sites.sort(key=lambda x: x[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:

                xcol = col * 3

                for row in range(mali_length):
                    identifier = identifiers[row]
                    start = max(xcol - options.context_size * 3, 0)
                    end = min(xcol + 3 + options.context_size * 3, mali_width)
                    segment = mali[identifier][start:end]
                    codon = mali[identifier][xcol:xcol + 3]
                    # nucleotide position -> codon index; must stay an int
                    pos = mali.getResidueNumber(identifier, xcol) // 3

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon), pos + 1, xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table", "negative-site-table",
                                  "neutral-site-table"):

                for row in range(mali_length):
                    if prefix:
                        options.stdout.write("%s\t" % prefix)

                    # with a context size, each site also shows its context
                    if options.context_size:
                        entries = [
                            "%s%i in %s" %
                            (x.mAA, x.mSequencePosition, x.mContext)
                            for x in chars_per_row[row]
                        ]
                    else:
                        entries = [
                            "%s%i" % (x.mAA, x.mSequencePosition)
                            for x in chars_per_row[row]
                        ]

                    options.stdout.write(
                        "%s\t%i\t%s\t%s\n" %
                        (identifiers[row], nsites, pp_value,
                         ";".join(entries)))

            elif options.method in ("positive-site-list", "negative-site-list",
                                    "neutral-site-list"):

                for row in range(mali_length):

                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    # sites are numbered from 1 within each sequence
                    for number, chars in enumerate(chars_per_row[row], 1):
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, number, chars.mAA,
                             chars.mSequencePosition, chars.mMaliPosition,
                             chars.mContext))

    options.stdout.flush()

    return counts
Exemplo n.º 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment (or, with ``--pairwise``, a sequence
    collection) from ``--file`` or stdin, optionally auto-detects the
    alphabet, builds the list of sequence pairs for the chosen iteration
    mode and writes pairwise distances to ``options.stdout``.  Depending
    on alphabet, ``--method`` and ``--distance`` the work is delegated to
    ``runBaseML``, ``runDNADIST``, ``runXrate``, phylip's protdist, or
    done with the ``CalculateDistance*`` helpers.

    Raises:
        ValueError: for an odd number of sequences in pairwise mode, an
            unknown ``--sites`` value, or a ``--distance`` that the
            selected code path cannot compute.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s",
        "--sites",
        dest="sites",
        type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f",
        "--file",
        dest="filename",
        type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o",
                      "--format",
                      dest="format",
                      type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m",
        "--min-sites",
        dest="min_sites",
        type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha",
                      dest="alpha",
                      type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha",
                      dest="fix_alpha",
                      action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump output.")

    parser.add_option("--test",
                      dest="test",
                      action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment",
                      dest="xrate_min_increment",
                      type=float,
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    # NOTE(review): argv is resolved above but not forwarded to E.Start
    # in this script - confirm that E.Start falls back to sys.argv.
    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment
    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    try:
        mali.readFromFile(infile, format=options.input_format)
    finally:
        # close a file we opened ourselves even if reading fails;
        # never close stdin
        if options.filename != "-":
            infile.close()

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        # nucleotide input if fewer than 10% of the characters remain
        # after removing a, c, g, t, x and n
        s = "".join(x.mString for x in mali.values()).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    # pairs hold indices into ids
    pairs = []
    if options.iteration == "all-vs-all":
        pairs = [(x, y)
                 for x in range(len(ids) - 1)
                 for y in range(x + 1, len(ids))]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, len(ids))]
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % len(ids))
        pairs = [(x, x + 1) for x in range(0, len(ids), 2)]

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in ("F84", "K80",
                                                                 "JC69",
                                                                 "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x],
                                                                 mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise ValueError("unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # previously surfaced as an unbound local variable
                    raise ValueError(
                        "distance %s is not available for nucleotide "
                        "sequences with method %s" %
                        (options.distance, options.method))

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y, options.format % distance,
                                  options.format % variance, info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            # number of "D" entries sent to protdist for each model;
            # JTT needs none.  The original compared against the
            # misspelt "PMG", so PMB was silently run without any "D".
            model_toggles = {
                "JTT": 0,
                "PMB": 1,
                "PAM": 2,
                "Kimura": 3,
                "CategoriesModel": 4,
            }
            phylip_options = ["D"] * model_toggles[options.distance]

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # previously surfaced as an unbound local variable
                    raise ValueError(
                        "distance %s is not available for amino acid "
                        "sequences" % options.distance)

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y, options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
Exemplo n.º 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin and writes ``--samples``
    resampled alignments, produced by ``getSampledMali`` (with
    ``--no-replacement``) or ``getBootstrappedMali`` (otherwise).  With
    ``--output-filename-pattern`` each sample goes to its own file,
    creating the target directory if needed; otherwise all samples are
    written to ``options.stdout``, separated by ``options.separator``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2bootstrap.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option(
        "-p",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help=
        "pattern for output filenames. Should contain a %(id)i. If not given, the output is to stdout with --separator [default=%default]."
    )

    parser.add_option("-n",
                      "--samples",
                      dest="samples",
                      type="int",
                      help="number of samples.")

    parser.add_option(
        "-r",
        "--no-replacement",
        dest="no_replacement",
        type="int",
        help=
        "sample without replacement. The parameter gives the size of the multiple alignment [default=%default]."
    )

    parser.add_option("-b",
                      "--block-size",
                      dest="block_size",
                      type="int",
                      help="block size. Use 3 for sampling from codons.")

    parser.set_defaults(input_format="fasta",
                        output_format="fasta",
                        samples=10,
                        block_size=1,
                        output_filename_pattern=None,
                        no_replacement=None,
                        separator="//")

    # NOTE(review): argv is resolved above but not forwarded to E.Start
    # in this script - confirm that E.Start falls back to sys.argv.
    (options, args) = E.Start(parser)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    for x in range(options.samples):

        if options.no_replacement is not None:
            new_mali = getSampledMali(mali, options.no_replacement,
                                      options.block_size)
        else:
            new_mali = getBootstrappedMali(mali, options.block_size)

        if options.output_filename_pattern:

            filename = options.output_filename_pattern % {"id": x + 1}
            target_directory = os.path.dirname(filename)
            # dirname is "" for a bare filename; os.makedirs("") would fail
            if target_directory and not os.path.exists(target_directory):
                os.makedirs(target_directory)
            outfile = open(filename, "w")
            E.info("creating mali %s" % filename)
            try:
                new_mali.writeToFile(outfile, format=options.output_format)
            finally:
                outfile.close()
        else:
            # Write to options.stdout and never close it: it is shared by
            # all samples and need not be the same object as sys.stdout.
            new_mali.writeToFile(options.stdout, format=options.output_format)

            if options.separator and x < options.samples - 1:
                options.stdout.write(options.separator + "\n")

    E.Stop()
Exemplo n.º 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two multiple alignments from the two files given as positional
    arguments, builds a profile from each, aligns the two profiles with a
    full dynamic programming alignator (local or global, see ``--mode``),
    merges the second alignment into the first one along the resulting
    alignment and writes the combined alignment to ``options.stdout``.

    Raises ValueError if not exactly two positional arguments are left
    after option parsing.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--gop",
                      dest="gop",
                      type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e",
                      "--gep",
                      dest="gep",
                      type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option(
        "-m",
        "--mode",
        dest="mode",
        type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info("read 2 multiple alignments")

    # read both alignments; close each input handle as soon as it has
    # been consumed instead of leaking it until interpreter exit.
    for mali, filename in ((mali1, args[0]), (mali2, args[1])):
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    # --mode is restricted to these two choices by the option parser
    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode, options.gop,
                                                     options.gep)

    # set the alignlib defaults before the profiles are built
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(cmali1,
                                        identifiers=mali1.getIdentifiers() +
                                        mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin and selects one or more master
    sequences (``--master``, ``--master-pattern``, ``--master-species``
    or, by default, the first sequence). Optionally applies exon
    information (``--exons``), splits columns aligned to different exons
    (``--split-exons``) and removes frameshifts (``--remove-frameshifts``):
    only columns that are part of a codon of a master sequence are kept
    and codons that fail ``checkCodon`` or are stop codons are masked with
    ``options.mask_char``. Sequences without aligned codons are removed.
    The resulting alignment is written to ``options.stdout`` in fasta
    format; with ``--translate`` the translated sequences are written to
    a separate file.

    Raises ValueError if the input alignment is empty.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="master sequence.")

    parser.add_option("-p",
                      "--master-pattern",
                      dest="master_pattern",
                      type="string",
                      help="master pattern.")

    parser.add_option("--master-species",
                      dest="master_species",
                      type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t",
                      "--translate",
                      dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c",
                      "--mark-codons",
                      dest="mark_codons",
                      action="store_true",
                      help="mark codons.")

    parser.add_option(
        "-i",
        "--ignore-case",
        dest="ignore_case",
        action="store_true",
        help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops",
                      dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char",
                      dest="mask_char",
                      type="string",
                      help="masking character to use.")

    parser.add_option("-f",
                      "--remove-frameshifts",
                      dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option(
        "--mask-master",
        dest="mask_master",
        action="store_true",
        help=
        "columns in master to be removed are masked to keep residue numbering."
    )

    parser.add_option(
        "-s",
        "--split-exons",
        dest="split_exons",
        action="store_true",
        help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a",
                      "--target",
                      dest="target",
                      type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    # the "paml" target is a preset for a group of options
    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        # string exceptions are not valid exceptions; raise a real one.
        raise ValueError("empty multiple alignment")

    identifiers = mali.getIdentifiers()

    ## 2. select master sequences
    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for seq_id in identifiers:
            if re.search(options.master_pattern, seq_id):
                masters.append(seq_id)
    elif options.master_species:
        # the species is the first field of the identifier
        for seq_id in identifiers:
            if options.master_species == seq_id.split(options.separator)[0]:
                masters.append(seq_id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile,
                                             filter=set(identifiers),
                                             from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #################################################################################
    ## translate characters to upper/lower case according to exon info.
    #################################################################################
    if exons:
        for seq_id in identifiers:
            if seq_id in exons:
                mali.getSequence(seq_id).mString = AddExonInformation(
                    mali[seq_id], exons[seq_id], mask_char=options.mask_char)

    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #################################################################################
    ## untangle misaligned exons
    #################################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                with open("log_mali1", "w") as logfile:
                    mali.writeToFile(logfile, format="fasta")

        # NOTE(review): options is passed positionally here but by keyword
        # above - confirm that the third positional parameter of
        # SplitExons is indeed ``options``.
        SplitExons(mali, exons, options)

    #################################################################################
    ## remove frameshifts
    #################################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:

            frame_columns = GetFrameColumns(mali,
                                            masters[0],
                                            gap_chars=options.gap_chars)

        else:

            columns = []

            for seq_id in masters:
                columns += GetFrameColumns(mali,
                                           seq_id,
                                           gap_chars=options.gap_chars)

            # fall back to the first sequence if no master yields codons
            if len(columns) == 0:
                columns += GetFrameColumns(mali,
                                           identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by (first, last) position. The "shortest" codon will
            # be first: (1,2,3) before (1,2,100), and (1,2,100) before (1,3,4).
            columns.sort(key=lambda x: (x[0], x[2]))

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    # overlapping, but out of register: remember the columns
                    out_of_frame_columns += last_codon

                last_codon = codon

            frame_columns.append(last_codon)

        # build set of all columns that are part of an in-frame codon
        frame_set = set()
        for column in frame_columns:
            frame_set.update(column)

        # columns that contain a master sequence that is out of
        # frame
        out_of_frame_set = set(out_of_frame_columns).difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                for n, column in enumerate(frame_columns):
                    options.stdlog.write("# %i\t%s\n" %
                                         (n, ",".join(map(str, column))))

                options.stdlog.write(
                    "# Out-of frame columns with residue of masters: %i\n" %
                    (len(out_of_frame_set)))
                # terminate the line so that it does not fuse with the
                # next log record.
                options.stdlog.write("# %s\n" %
                                     ",".join(map(str, out_of_frame_columns)))

        to_delete = []

        for seq_id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(seq_id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (seq_id, len(sequence)))

            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            # codon: non-gap characters collected for the current codon
            # chars: all kept characters (including gaps) since the last
            #        completed codon
            codon = []
            chars = []

            for x, c in enumerate(sequence):

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)

                # note: this is also true while no residue has been
                # collected yet, i.e. for runs of gaps.
                if len(codon) % 3 == 0:
                    codon_string = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon_string, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if codon_string.upper() in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        # mask residues, but keep gaps
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (seq_id, len(fragments), naligned, ncodons, nstops,
                       ngaps, nmasked))
                options.stdlog.flush()

            ## postpone deletion in order to not
            ## confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n"
                    % seq_id)
                to_delete.append(seq_id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n"
                    % seq_id)
                to_delete.append(seq_id)
            else:
                mali.setSequence(seq_id, "".join(fragments))

        for seq_id in to_delete:
            del mali[seq_id]

    ## output of the (pruned) alignment
    # NOTE(review): iterates the identifier list obtained before sequences
    # were deleted - presumably getIdentifiers() returns the alignment's
    # live list so that deleted entries are gone here; confirm.
    for seq_id in identifiers:
        if options.mark_codons:
            a = mali[seq_id]
            # separate codons by spaces
            s = " ".join([a[x:x + 3] for x in range(0, len(a), 3)])
        else:
            s = mali[seq_id]
        options.stdout.write(">%s\n%s\n" % (seq_id, s))

    if options.filename_translation:
        with open(options.filename_translation, "w") as outfile:
            for seq_id in mali.keys():
                outfile.write(
                    ">%s\n%s\n" %
                    (seq_id, Genomics.TranslateDNA2Protein(mali[seq_id])))

    E.Stop()
Exemplo n.º 29
0
def _buildCodeMLOptions(options):
    """translate command line options into a dictionary of codeml options.

    Only options that have been set are added to the dictionary; all
    values are strings except ``cleandata``, which is passed through
    as given.
    """

    codeml_options = {}

    seqtype_codes = {"codon": "1", "aa": "2", "trans": "3"}
    if options.seqtype in seqtype_codes:
        codeml_options["seqtype"] = seqtype_codes[options.seqtype]

    if options.clean_data:
        codeml_options["cleandata"] = options.clean_data

    if options.omega is not None:
        codeml_options["omega"] = str(options.omega)

    if options.kappa is not None:
        codeml_options["kappa"] = str(options.kappa)

    if options.fix_kappa:
        codeml_options["fix_kappa"] = "1"

    if options.fix_omega:
        codeml_options["fix_omega"] = "1"

    if options.codon_frequencies is not None:
        # known names are mapped to their numeric code, anything else is
        # passed through unchanged.
        codon_frequency_codes = {
            "UNIFORM": "0",
            "F1X4": "1",
            "F3X4": "2",
            "F61": "3",
        }
        codeml_options["CodonFreq"] = codon_frequency_codes.get(
            options.codon_frequencies.upper(), options.codon_frequencies)

    if options.paml_method is not None:
        # use the option that was tested in the guard (previously
        # options.method was stored here).
        codeml_options["paml_method"] = str(options.paml_method)

    if options.optimization_threshold is not None:
        codeml_options["Small_Diff"] = str(options.optimization_threshold)

    return codeml_options


def runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options):
    """run codeml on a multiple alignment and output pairwise results.

    If *pairs* is given and either ``options.pairwise`` is set or
    *has_non_overlaps* is true, codeml is run separately for each pair of
    sequences (*pairs* holds index tuples into the identifiers of
    *mali*). Pairs whose alignment is shorter than ``options.min_overlap``
    are skipped. Otherwise codeml is run once on the full alignment
    together with *tree*.

    Results are output through ``printPairs``; *map_new2old* is handed on
    to it unchanged. Progress and summary information is written to
    ``options.stdlog``. The function does not return a value.
    """

    ids = mali.getIdentifiers()

    ## setup codeml
    codeml_options = _buildCodeMLOptions(options)

    ninput, noutput, nskipped = 0, 0, 0
    tstart = time.time()

    if pairs and (options.pairwise or has_non_overlaps):
        wrapper = WrapperCodeML.CodeMLPairwise()

        ## do pairwise run
        # collects the pairs of all runs; not used any further within
        # this function.
        result = WrapperCodeML.CodeMLResultPairs()

        # number of all possible pairs, used for progress reporting
        ntotal = (len(ids) * (len(ids) - 1)) // 2

        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            ninput += 1

            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(ids[x], m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(ids[y], m2.mFrom, m2.mTo, m2.mString)

            ## remove empty columns and masked columns
            if options.clean_mali:
                # treat masked characters like gaps
                temp_mali.mGapChars = temp_mali.mGapChars + ("n", "N")
                temp_mali.removeGaps(minimum_gaps=1, frame=3)

            if temp_mali.getWidth() < options.min_overlap:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# pair %s-%s: not computed because only %i residues overlap\n"
                        % (mali.getEntry(ids[x]).mId, mali.getEntry(
                            ids[y]).mId, temp_mali.getWidth()))
                nskipped += 1
                continue

            sub_result = wrapper.Run(temp_mali,
                                     options=codeml_options,
                                     dump=options.dump)
            result.mPairs += sub_result.mPairs

            if options.loglevel >= 1 and ninput % options.report_step == 0:
                options.stdlog.write(
                    "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                    (ninput, ntotal, 100.0 * ninput / ntotal,
                     time.time() - tstart))
                options.stdlog.flush()

            noutput += printPairs(sub_result.mPairs, mali, map_new2old,
                                  options)

            options.stdout.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# pairwise computation: ninput=%i, noutput=%i, nskipped=%i\n"
                % (ninput, noutput, nskipped))
            options.stdlog.flush()

    else:
        ## single run on the complete alignment
        wrapper = WrapperCodeML.CodeML()

        result = wrapper.Run(mali,
                             tree=tree,
                             options=codeml_options,
                             dump=options.dump)

        result_pairs = WrapperCodeML.CodeMLResultPairs()
        result_pairs.fromResult(result)
        noutput += printPairs(result_pairs.mPairs, mali, map_new2old, options)

        l = mali.getLength()
        if options.loglevel >= 1:
            options.stdlog.write("# input=%i, npairs=%i, noutput=%i\n" %
                                 (l, l *
                                  (l - 1) // 2, len(result_pairs.mPairs)))
Exemplo n.º 30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    With ``--method=add``, an existing multiple alignment is read from
    the file named by the first entry of ``--parameters``. Each sequence
    read from stdin (fasta format) is then aligned against a profile of
    the growing alignment and added to it. The extended alignment is
    written to stdout. Without a method nothing is output.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("add", ),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a",
                      "--alignment-method",
                      dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    # the parameters form a stack that is consumed from the front
    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        # the first parameter is the filename of the alignment to extend
        with open(options.parameters[0], "r") as infile:
            mali.readFromFile(infile, format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        while True:
            # stop at the end of input, no matter whether the iterator
            # signals it by raising StopIteration or by returning None.
            try:
                cur_record = iterator.next()
            except StopIteration:
                break
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            # the profile is rebuilt for each sequence so that it includes
            # the sequences added so far
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            ## add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            # NOTE(review): presumably hands ownership of the object to
            # new_mali - confirm against the alignlib bindings.
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            # register the new sequence using its aligned string, which is
            # the last row of new_mali
            seq_id = cur_record.title
            mali.mIdentifiers.append(seq_id)
            mali.mMali[seq_id] = Mali.AlignedString(
                seq_id, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute the original sequences by their rows in the new mali
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
Exemplo n.º 31
0
        xrate_min_increment=0.000001,
        with_rho=True,
        separator="|",
        single_omega=False,
        shared_frequencies=False,
        shared_rates=False,
        block_size=None,
        replicates=None,
    )

    (options, args) = Experiment.Start(parser)

    if options.replicates != None:
        # read a sequence collection with possible duplicate names
        # used for benchmarking
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau\tlen")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    options.stdout.write("\terror_str\n")

    if options.replicates != None:
        ids = mali.getIdentifiers()
Exemplo n.º 32
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Two multiple alignments are read from the files given as positional
    arguments. A profile is built from each of them and the profiles are
    aligned by full dynamic programming in the mode selected by
    ``--mode``. The second alignment is then added to the first one
    according to the profile alignment and the result is written to
    ``options.stdout``.

    Raises ValueError unless exactly two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    mali1 = Mali.Mali()
    mali2 = Mali.Mali()

    E.info("read 2 multiple alignments")

    # make sure that the input files are closed once they have been read
    infile = IOTools.openFile(args[0], "r")
    try:
        mali1.readFromFile(infile, format=options.format)
    finally:
        infile.close()

    infile = IOTools.openFile(args[1], "r")
    try:
        mali2.readFromFile(infile, format=options.format)
    finally:
        infile.close()

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    # the option parser only permits "local" and "global"
    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode,
                                                     options.gop, options.gep)

    # alignlib defaults are set before the profiles are created
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # add the second alignment to the first one along the profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(cmali1,
                                        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 33
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on ``--method`` the script works in one of two ways:

    * ``summary-slr``, ``summary-filtered``, ``*-site-table`` and
      ``*-site-list``: SLR output is parsed with ``slr.parseOutput`` and
      handed to ``ProcessResult``.  If ``--filename-sites`` contains
      ``%s``, a table with a ``prefix`` column (and optionally a ``p``
      column) is read from stdin and the pattern is expanded once per
      row; rows whose sites file does not exist are counted as skipped.
      Otherwise a single sites file (or stdin for ``-``) is processed.
    * ``over-representation``: tab-separated rows are read from stdin, a
      background probability of a site being positive is computed over
      all rows, and rows whose binomial tail probability is below
      ``--significance-threshold`` are written to stdout, sorted by that
      probability.

    Raises:
        ValueError: if a multiple alignment is required by the chosen
            method but ``--filename-mali`` was not supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("summary-slr", "summary-filtered",
                               "over-representation", "positive-site-table",
                               "negative-site-table", "neutral-site-table",
                               "positive-site-list", "negative-site-list",
                               "neutral-site-list"),
                      help="method to apply.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("-s",
                      "--filename-sites",
                      dest="filename_sites",
                      type="string",
                      help="filename with sites information.")

    parser.add_option("-l",
                      "--filename-log",
                      dest="filename_log",
                      type="string",
                      help="filename with logging information.")

    parser.add_option(
        "-m",
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help=
        "filename of multiple alignment, that was input to SLR. If given, is used to filter indels."
    )

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help="threshold for probability above which to include positive sites."
    )

    # the help text used to be a copy of the one for --only-header.
    parser.add_option("--no-header",
                      dest="write_header",
                      action="store_false",
                      help="do not output header.")

    parser.add_option("--only-header",
                      dest="only_header",
                      action="store_true",
                      help="only output header.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="threshold for significance tests [%default].")

    parser.add_option("--use-adjusted",
                      dest="use_adjusted",
                      action="store_true",
                      help="use SLR adjusted probability values.")

    parser.add_option("--truncate-sites-list",
                      dest="truncate_sites_list",
                      type="int",
                      help="truncate sites list after ## entries (0 for all).")

    parser.add_option(
        "--context-size",
        dest="context_size",
        type="int",
        help="size of left/right context around a selected residue.")

    parser.set_defaults(
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        filename_sites="-",
        filename_log=None,
        filename_mali=None,
        significance_threshold=0.05,
        write_header=True,
        only_header=False,
        use_adjusted=False,
        context_size=0,
        truncate_sites_list=0,
    )

    # honour the *argv* parameter instead of silently falling back to
    # sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    slr = WrapperSlr.Slr()

    site_table_methods = ("positive-site-table", "negative-site-table",
                          "neutral-site-table")
    site_list_methods = ("positive-site-list", "negative-site-list",
                         "neutral-site-list")

    # a filename pattern implies one output row per prefix, hence a
    # prefix column is always written.
    if "%s" in options.filename_sites:
        options.prefix = True

    write_header = options.write_header or options.only_header

    # write headers
    if options.method == "summary-slr":

        if write_header:

            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This uses the thresholds as set in SLR. Use "counts" for filtering
# residues based on your own thresholds
""")
            thresholds = "95%", "99%", "95% corrected", "99% corrected"

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t")
            options.stdout.write("\t".join(
                ["npos_" + x.replace(" ", "_") for x in thresholds]))
            options.stdout.write("\t")
            options.stdout.write("\t".join(
                ["nneg_" + x.replace(" ", "_") for x in thresholds]))
            options.stdout.write("\n")

    elif options.method == "summary-filtered":

        if write_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This method uses the supplied threshold and the multiple alignment to filter.
# All positions that are above the threshold (P-Value) and which are located in
# indels: >= 1 sequence missing from column, are removed.
""")

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n"
            )

    elif options.method in site_table_methods:

        if write_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and signifnicance was determined according
# with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write("cluster\tnsites\tp-value\tsites\n")

    elif options.method in site_list_methods:

        if write_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Sites under positive/neutral/negative selection according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and signifnicance was determined according
# with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n")

    elif options.method == "over-representation":

        if write_header:
            if options.loglevel >= 1:
                options.stdlog.write("""# Genes with over-represented sites.
#
# This method uses as input the output of summary-filtered.
""")

    if options.only_header:
        sys.exit(0)

    if options.method in ("summary-slr", "summary-filtered") + \
            site_table_methods + site_list_methods:

        ninput, noutput, nskipped = 0, 0, 0

        if "%s" in options.filename_sites:

            # batch mode: one set of files per row of the table on stdin.
            headers, table = CSV.ReadTable(sys.stdin)

            fprefix = headers.index("prefix")

            try:
                fsignificance = headers.index("p")
            except ValueError:
                fsignificance = None

            for row in table:

                prefix_id = row[fprefix]
                if fsignificance is not None:
                    p_value = row[fsignificance]
                else:
                    p_value = None

                ninput += 1

                # plain substitution: unlike re.sub, str.replace does not
                # interpret backslashes or group references in the
                # identifier.
                fn = options.filename_sites.replace("%s", prefix_id)
                if not os.path.exists(fn):
                    nskipped += 1
                    continue

                with open(fn, "r") as infile:
                    lines_sites = infile.readlines()

                # the log file is optional; without this default the name
                # was unbound and parseOutput() failed with a NameError.
                # NOTE(review): assumes parseOutput() accepts None for a
                # missing log - confirm against WrapperSlr.
                lines_log = None
                if options.filename_log:
                    log_fn = options.filename_log.replace("%s", prefix_id)
                    with open(log_fn, "r") as infile:
                        lines_log = infile.readlines()

                result = slr.parseOutput(lines_sites, lines_log)

                if options.method == "summary-filtered" or \
                        options.method in site_table_methods:
                    if not options.filename_mali:
                        raise ValueError(
                            "please supply a multiple alignment for filtering.")
                    mali = Mali.Mali()
                    mali_fn = options.filename_mali.replace("%s", prefix_id)
                    with open(mali_fn, "r") as infile:
                        mali.readFromFile(infile)
                else:
                    mali = None

                ProcessResult(result,
                              options,
                              mali,
                              prefix=prefix_id,
                              p_value=p_value)
                noutput += 1
        else:
            # single file mode
            if options.filename_sites == "-":
                lines_sites = sys.stdin.readlines()
            else:
                with open(options.filename_sites, "r") as infile:
                    lines_sites = infile.readlines()

            ninput += 1

            # see batch mode above: the log file is optional.
            lines_log = None
            if options.filename_log:
                with open(options.filename_log, "r") as infile:
                    lines_log = infile.readlines()

            result = slr.parseOutput(lines_sites, lines_log)

            if options.filename_mali:
                mali = Mali.Mali()
                with open(options.filename_mali, "r") as infile:
                    mali.readFromFile(infile)
            else:
                if options.method == "summary-filtered":
                    # string exceptions are not valid; raise a real one.
                    raise ValueError(
                        "please supply a multiple alignment for filtering.")

                mali = None

            ProcessResult(result, options, mali, prefix=options.prefix)
            noutput += 1

        if options.loglevel >= 1:
            options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                                 (ninput, noutput, nskipped))

    elif options.method == "over-representation":

        results = []
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            # strip only the newline: slicing off the last character
            # truncated the final column if the last line had no newline.
            data = line.rstrip("\n").split("\t")
            if data[0] == "prefix":
                continue

            results.append(
                Result(data[0], int(data[6]), int(data[7]), int(data[8]),
                       int(data[9]), int(data[10])))

        # probability of a single site being positive
        ntotal = sum([x.mNTotal for x in results])
        npositives = sum([x.mNPositive for x in results])
        # without any sites there is no background rate; avoid dividing
        # by zero (all rows are skipped below in that case anyway).
        if ntotal > 0:
            p = float(npositives) / float(ntotal)
        else:
            p = 0.0

        if options.loglevel >= 1:
            options.stdlog.write("# sites: total=%i, positive=%i, p=%f\n" %
                                 (ntotal, npositives, p))

        new_results = []
        for result in results:
            if result.mNTotal == 0:
                continue

            # use -1, because I need P( x >= X)
            # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X )
            # = P (x > X ).
            r = scipy.stats.binom.sf(result.mNPositive - 1, result.mNTotal,
                                     p)

            result.mSignificance = r

            if r < options.significance_threshold:
                new_results.append(result)

        # key-based sort: same (stable, ascending) order as the former
        # cmp-based comparator, but also valid on Python 3.
        new_results.sort(key=lambda x: x.mSignificance)

        # NOTE(review): the column header goes to stdlog while the rows
        # go to stdout - kept as is, confirm whether this is intended.
        options.stdlog.write(Result().getHeader() + "\n")

        for result in new_results:
            options.stdout.write(str(result) + "\n")

        if options.loglevel >= 1:
            options.stdlog.write("# ntotal=%i, npos=%i\n" %
                                 (len(results), len(new_results)))

    E.Stop()
Exemplo n.º 34
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin and a tab-separated coordinates
    file with three columns (identifier, length, position).  For each
    coordinate row, the column range ``[position, position + length)`` is
    cut out of every sequence and written to the file named by
    ``--pattern-mali`` with the identifier substituted for ``%s``.

    Lines of the coordinates file starting with ``#`` and the header line
    (first column ``component``) are ignored.

    Raises:
        ValueError: if ``--filename-coordinates`` was not supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2malis.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--pattern-mali",
                      dest="pattern_mali",
                      type="string",
                      help="filename pattern for multiple alignment files.")

    parser.add_option(
        "--filename-coordinates",
        dest="filename_coordinates",
        type="string",
        help="filename of coordinates that constitute the multiple alignment.")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal"),
                      help="input format of multiple alignment")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("fasta", "codeml", "phylip"),
                      help="output format of multiple alignment")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        filename_coordinates=None,
        pattern_mali="%s.fasta",
    )

    # honour the *argv* parameter instead of silently falling back to
    # sys.argv.
    (options, args) = E.Start(parser, argv=argv)

    # the coordinates drive the whole script; without them the loop below
    # used to fail with a NameError after stdin had already been read.
    if not options.filename_coordinates:
        raise ValueError(
            "please supply a file with coordinates (--filename-coordinates).")

    # read coordinates
    coordinates = []
    with open(options.filename_coordinates, "r") as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            # strip only the newline: slicing off the last character
            # truncated the position if the last line had no newline.
            component, length, position = line.rstrip("\n").split("\t")
            if component == "component":
                continue
            coordinates.append((component, int(length), int(position)))

    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)

    ids = mali.getIdentifiers()

    ninput, noutput = 0, 0

    for component, length, position in coordinates:

        ninput += 1
        part_mali = Mali.Mali()

        for x in ids:
            part_mali.addSequence(x, 0, length,
                                  mali[x][position:position + length])

        outfile_name = options.pattern_mali % component

        # close each output file as soon as it is written so that many
        # components do not exhaust the available file handles.
        with open(outfile_name, "w") as outfile:
            part_mali.writeToFile(outfile, format=options.output_format)

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# input=%i, output=%i\n" % (ninput, noutput))

    E.Stop()