Exemplo n.º 1
0
    def run(self, grammar, tree=None, dump=0, test=False, options=None):
        """Run the external executable on a grammar and an optional tree.

        A fresh temporary directory is created, the grammar (and the tree,
        if one is given) is written into it, and ``self.mExecutable`` is
        started there through the shell.

        Arguments
        ---------
        grammar :
            object providing ``getGrammar()``; the returned text is written
            verbatim to the grammar file.
        tree :
            optional string. If, after stripping whitespace, it starts with
            "(" and ends with ")" or ";", it is written out as is.
            Any other string is used as a filename; the first tree read
            from that file is written out.
        dump :
            if true, the stdout of the executable is echoed.
        test :
            if true, the temporary directory is reported and kept.
        options :
            accepted for interface compatibility; not used in this method.

        Returns
        -------
        The result of ``self.parseOutput`` applied to the lines of stdout.

        Raises
        ------
        UsageError
            if the executable exits with a non-zero return code. The
            temporary directory is left in place in that case so that it
            can be inspected.
        """

        self.mTempdir = tempfile.mkdtemp()
        self.mFilenameGrammar = "grammar.eg"
        self.mFilenameTree = "tree.nh"
        self.mFilenameOutput = None
        self.mWarnings = []

        if test:
            print("# temporary directory is %s" % self.mTempdir)

        with open(self.mTempdir + "/" + self.mFilenameGrammar, "w") as outfile:
            outfile.write(grammar.getGrammar())

        if tree:
            with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
                # NOTE(review): only string input is handled; any other
                # truthy value leaves the tree file empty.
                if isinstance(tree, StringType):
                    t = tree.strip()
                    # startswith/endswith instead of indexing, so that a
                    # whitespace-only string does not raise IndexError.
                    if t.startswith("(") and t.endswith((")", ";")):
                        outfile.write("%s\n" % t)
                    else:
                        with open(tree, "r") as infile:
                            nexus = TreeTools.Newick2Nexus(infile)
                            first_tree = nexus.trees[0]
                        outfile.write(
                            "%s\n" % TreeTools.Tree2Newick(first_tree))

        # use your own random seed. Time won't do, if simgram
        # is called in quick succession.
        # NOTE(review): the tree file is passed with -t even when no tree
        # was written, and the upper seed bound (inclusive) is 2**32 --
        # confirm both against the executable.
        statement = "%s -rndseed %i -g %s -t %s" % (
            self.mExecutable,
            random.randint(0, 4294967296),
            self.mFilenameGrammar,
            self.mFilenameTree)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             cwd=self.mTempdir,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise UsageError(
                "Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                    self.mExecutable, err, out, self.mTempdir))

        if dump:
            print("# stdout output of %s:\n%s\n######################################" % (
                self.mExecutable, out))

        if not test:
            shutil.rmtree(self.mTempdir)

        return self.parseOutput(out.split("\n"))
Exemplo n.º 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Trees are read from stdin and one line is written per taxon. With a
    single input tree only the taxon is written; with several trees the
    running tree number is added and, unless ``--skip-trees`` is given,
    the tree name as a third column.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees",
        dest="skip_trees",
        action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" %
                             len(nexus.trees))

    ntotal = len(nexus.trees)
    single_tree = ntotal == 1

    # header line
    if single_tree:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    # body: trees are numbered starting at 1
    for index, tree in enumerate(nexus.trees):
        ntree = index + 1
        for taxon in TreeTools.GetTaxa(tree):
            if single_tree:
                line = "%s\n" % (taxon)
            elif options.skip_trees:
                line = "%s\t%i\n" % (taxon, ntree)
            else:
                line = "%s\t%i\t%s\n" % (taxon, ntree, tree.name)
            options.stdout.write(line)

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % (ntotal))

    E.Stop()
Exemplo n.º 3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The first tree of the reference file (``-r``) is compared to every
    tree read from stdin with ``TreeTools.IsCompatible``. A one-line
    summary with the number of compatible and failed trees, and the
    failures split by reported reason, is printed at the end.

    Raises ValueError if no reference tree file has been given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/evaluate_trees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    # NOTE(review): the long option is spelled with a trailing "=";
    # kept as is so the command line interface does not change.
    parser.add_option("-r",
                      "--reference=",
                      dest="filename_reference_tree",
                      help="filename with reference tree.",
                      type="string")

    parser.set_defaults(filename_reference_tree=None)

    (options, args) = E.Start(parser)

    # Stop here instead of only printing a message and then failing
    # later when the missing filename is opened.
    if not options.filename_reference_tree:
        raise ValueError("please supply reference tree.")

    if options.loglevel >= 1:
        print("# reading reference tree.")

    with open(options.filename_reference_tree, "r") as infile:
        nexus = TreeTools.Newick2Nexus(infile)
        reference_tree = nexus.trees[0]

    if options.loglevel >= 1:
        print("# reading sample trees.")

    nexus2 = TreeTools.Newick2Nexus(sys.stdin)

    ntotal, nok, nfailed = 0, 0, 0
    # failures per reason; other reasons only count towards nfailed
    reasons = {"topology": 0, "taxa": 0, "leaves": 0}
    for t in nexus2.trees:
        ntotal += 1
        is_ok, reason = TreeTools.IsCompatible(reference_tree, t)
        if is_ok:
            nok += 1
            continue
        nfailed += 1
        if reason in reasons:
            reasons[reason] += 1

    print("# total=%i, compatible=%i, failed=%i, topology=%i, taxa=%i, leaves=%i" %
          (ntotal, nok, nfailed,
           reasons["topology"], reasons["taxa"], reasons["leaves"]))

    E.Stop()
Exemplo n.º 4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The reference tree given with ``-t`` is parsed and the patterns
    returned by ``TreeTools.calculatePatternsFromTree`` are printed, one
    per line. The OTU order is taken from ``-s`` (comma-separated) or,
    if that is not given, from the terminals of the reference tree.

    Raises ValueError if no reference tree has been given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort_order",
                      type="string",
                      help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    # String exceptions are not valid; raise a proper exception class.
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    # The tree has to be parsed before the default sort order can be
    # derived from its terminals.
    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        # display() is called without print, as elsewhere in this file.
        reference_tree.display()

    patterns = TreeTools.calculatePatternsFromTree(reference_tree,
                                                   options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
Exemplo n.º 5
0
    def WriteTree(self, tree):
        """write tree to file.

        The first tree parsed from *tree* is relabelled with
        ``self.mMapOld2New`` and written to ``self.mFilenameTree`` inside
        ``self.mTempdir``. The file starts with a line holding
        ``self.mNumSequences`` followed by ``1``; the tree follows on the
        next line.
        """

        nexus = TreeTools.Newick2Nexus(tree)
        t = nexus.trees[0]
        TreeTools.MapTaxa(t, self.mMapOld2New)
        # build the text first, so that a conversion error does not leave
        # a partially written file behind.
        newick = TreeTools.Tree2Newick(t)

        # the context manager closes the file even if a write fails
        with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
            outfile.write("%i 1\n" % self.mNumSequences)
            outfile.write("%s\n" % newick)
Exemplo n.º 6
0
    def processChunk(lines, map_strain2species, options):
        """Apply species mergers to all trees in a chunk and write them out.

        *lines* is parsed with ``TreeTools.Newick2Nexus``. For each tree
        the mergers are collected and applied; trees left with more than
        one terminal are written to ``options.stdout`` in
        ``options.output_format``.

        The module-level counters ``ninput``, ``nmerged``, ``nskipped``
        and ``noutput`` are updated.
        """
        global ninput, noutput, nskipped, nmerged

        parsed = TreeTools.Newick2Nexus(lines)

        for tree in parsed.trees:
            ninput += 1

            if options.loglevel >= 3:
                tree.display()

            mergers = getSpeciesTreeMergers(tree, map_strain2species, options)

            if options.loglevel >= 3:
                message = "# found %i nodes in the tree that will be merged.\n" % (
                    len(mergers))
                options.stdlog.write(message)

            if len(mergers):
                nmerged += 1

            # the tree is modified by this call; its result is not needed
            applySpeciesTreeMergers(
                tree, mergers, map_strain2species, options)

            if len(tree.get_terminals()) > 1:
                tree.writeToFile(options.stdout, format=options.output_format)
                noutput += 1
            else:
                nskipped += 1
Exemplo n.º 7
0
def trainMali(mali, options):
    """train a grammar on a multiple alignment.

    The steps visible in this function prepare the input: the alignment
    is optionally cleaned, identifiers containing ``options.separator``
    are renamed to the part before the first separator, the identifiers
    are mapped and, if ``options.input_filename_tree`` is set, the first
    tree in that file is relabelled with the inverted identifier map.

    Raises KeyError if relabelling the tree fails because names in the
    alignment and in the tree do not match.
    """

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=1)

    # NOTE(review): length, input_model and ids are not used in the
    # visible part of this function; kept because the body looks like an
    # excerpt -- confirm before removing.
    length = mali.getNumColumns()

    input_model = prepareGrammar(options)

    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        # close the tree file once the first tree has been fetched
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        try:
            tree.relabel(map_old2new, warn=True)
        except KeyError as msg:
            raise KeyError(
                "names in mali and tree are not congruent: %s" % msg)
Exemplo n.º 8
0
    def plotLines(self):
        """Add the branch lines of the tree to the plot.

        The nested function is handed to ``TreeTools.TreeDFS`` as
        ``pre_function``, starting at the root of ``self.mTree``.
        """

        def drawBranches(node_id):
            # look up the node and its stored coordinates
            node = self.mTree.node(node_id)

            x_start = self.mNodeWidthsStart[node_id]
            x_end = self.mNodeWidthsEnd[node_id]
            y = self.mNodeHeights[node_id]

            # horizontal branch: skipped for zero-width branches and
            # for the root
            if not (x_end == x_start or node_id == self.mTree.root):
                horizontal = self.mDecoratorHorizontalBranches.getElements(
                    node_id,
                    self.getHeaderWidth() + x_start,
                    self.getHeaderWidth() + x_end,
                    self.getHeaderHeight() + y)
                self.addElements(horizontal)

            # one vertical line per child, from this node's height to the
            # height of the child
            for child in node.succ:
                child_y = self.mNodeHeights[child]
                vertical = self.mDecoratorVerticalBranches.getElements(
                    node_id,
                    self.getHeaderWidth() + x_end,
                    self.getHeaderHeight() + y,
                    self.getHeaderHeight() + child_y)
                self.addElements(vertical)

        TreeTools.TreeDFS(self.mTree, self.mTree.root,
                          pre_function=drawBranches)
Exemplo n.º 9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Lines starting with "#" are dropped from stdin; the first tree in
    the remaining input is passed to ``TreeGraph.Run`` and the result
    is printed.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2plot.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.set_defaults()

    (options, args) = E.Start(parser, add_pipe_options=True)

    # skip comment lines
    lines = [line for line in sys.stdin.readlines() if line[0] != "#"]

    input_tree = TreeTools.Newick2Nexus(lines).trees[0]

    treegraph = TreeGraph(support=None, loglevel=options.loglevel)

    print(treegraph.Run(input_tree))

    E.Stop()
Exemplo n.º 10
0
def TranslateNode(node, tree, terminals, options):
    """Return a label for *node*.

    With ``options.do_translate`` set, the label is the sorted leaves
    below the node joined by ``options.separator``. Otherwise a node
    listed in *terminals* is labelled with its taxon and any other node
    with ``str(node)``.
    """
    if options.do_translate:
        leaves = sorted(TreeTools.GetLeaves(tree, node))
        return options.separator.join(leaves)

    if node in terminals:
        return tree.node(node).data.taxon

    return str(node)
Exemplo n.º 11
0
def rerootTree(gene_tree, extract_species, options):
    """Reroot *gene_tree* in place using the outgroup species.

    Outgroup taxa are the taxa of the tree for which *extract_species*
    returns a species listed in ``options.outgroup_species``. If the
    tree reports them as monophyletic, all of them are used for
    rooting; otherwise only the first one is used. If there are no
    outgroup taxa, the tree is left as it is.

    Raises ValueError if collecting the outgroup taxa fails with an
    AttributeError.
    """

    otus = TreeTools.GetTaxa(gene_tree)

    # find monophyletic trees of outgroup_species
    try:
        outgroup_taxa = [
            x for x in otus
            if extract_species(x) in options.outgroup_species]
    except AttributeError:
        # string exceptions are not valid; raise a proper exception class
        raise ValueError("error while rerooting tree in tree %s with %s" % (
            gene_tree.name, str(otus)))

    if not outgroup_taxa:
        # nothing to root with; avoids indexing into an empty list below
        r = []
    # NOTE(review): Bio.Nexus.Trees.Tree.is_monophyletic returns a node
    # id or -1 rather than a boolean. If gene_tree follows that
    # convention, this truth test does not separate the two cases --
    # confirm against the tree class in use.
    elif gene_tree.is_monophyletic(outgroup_taxa):
        r = outgroup_taxa
    else:
        r = [outgroup_taxa[0], ]

    if r:
        if options.loglevel >= 1:
            options.stdlog.write("# tree %s: rerooting with %i outgroups:  %s.\n" % (
                gene_tree.name, len(r), ",".join(r)))
            options.stdlog.flush()
        gene_tree.root_with_outgroup(r)
    else:
        # as the message says, the tree is not touched in this case
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: no outgroup found, tree will not be rerooted.\n" % gene_tree.name)
            options.stdlog.flush()

    if options.loglevel >= 5:
        gene_tree.display()
Exemplo n.º 12
0
    def parseOutput(self, lines, out, err):
        """Build a result object from the output of a run.

        *lines* are concatenated, all whitespace and all bracketed
        sections ("[...]") are removed, and the remaining text is parsed
        with ``TreeTools.Newick2Nexus``. The taxa of the first tree are
        mapped with ``self.mMapNew2Old``.

        Returns a ``Result`` with the tree in ``mTree``, *out* in
        ``mLog`` and *err* in ``mErr``.
        """

        # raw strings: "\s" and "\[" are not valid string escapes
        text = re.sub(r"\s", "", "".join(lines))
        text = re.sub(r"\[[^\]]+\]", "", text)

        nexus = TreeTools.Newick2Nexus(text)

        result = Result()
        tree = nexus.trees[0]

        TreeTools.MapTaxa(tree, self.mMapNew2Old)

        result.mTree = tree

        result.mLog = out
        result.mErr = err

        return result
Exemplo n.º 13
0
def filterTree(tree, options, map_id2location=None):
    """apply location filter to tree.

    If ``options.remove_unplaced`` is set, every taxon whose location in
    *map_id2location* has an ``mShortName`` that (in lower case) is
    listed in ``MAP_CONTIG2JUNK`` is removed. Taxa without a known
    location are kept; a warning is logged for them. The tree is pruned
    in place with ``TreeTools.PruneTree`` if any taxon was removed.

    *map_id2location* may be omitted; it is then treated as empty.
    """

    otus = TreeTools.GetTaxa(tree)

    # The default of None cannot be used for membership tests; fall back
    # to an empty map so that every taxon is reported as unknown.
    if map_id2location is None:
        map_id2location = {}

    to_remove = set()
    if options.remove_unplaced:
        for otu in otus:
            if otu not in map_id2location:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if map_id2location[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            options.stdlog.write("# tree %s: removing %i entries because of location: %s\n" %
                                 (tree.name, len(to_remove), ";".join(to_remove)))

    new_otus = list(set(otus).difference(to_remove))

    if len(new_otus) != len(otus):

        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write("# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
                             (tree.name, len(otus), len(to_remove), len(new_otus), len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
Exemplo n.º 14
0
def getBestTree(trees, method="select-largest"):
    """select best tree out of a set of trees.

    With method "select-largest" the tree with the most taxa is
    returned; among trees of equal size the last one wins. For any other
    method nothing is selected and None is returned.

    At log level 3 and above the trees not chosen are written to
    ``options.stdlog`` (``options`` is a module-level name).
    """

    if method != "select-largest":
        return None

    # sort by (size, position); the last entry is the winner
    ranked = sorted(
        (len(tree.get_taxa()), index) for index, tree in enumerate(trees))
    best_tree = ranked[-1][1]

    if options.loglevel >= 3:
        for index, tree in enumerate(trees):
            if index != best_tree:
                options.stdlog.write(
                    "# skipped tree: %s: %s\n" % (tree.name, TreeTools.Tree2Newick(tree)))

    return trees[best_tree]
Exemplo n.º 15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The reference tree given with ``-t`` is parsed and the patterns
    returned by ``TreeTools.calculatePatternsFromTree`` are printed, one
    per line. The OTU order is taken from ``-s`` (comma-separated) or,
    if that is not given, from the terminals of the reference tree.

    Raises ValueError if no reference tree has been given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-s", "--sort-order", dest="sort_order", type="string",
                      help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    # String exceptions are not valid; raise a proper exception class.
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    # The tree has to be parsed before the default sort order can be
    # derived from its terminals.
    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        # display() is called without print, as elsewhere in this file.
        reference_tree.display()

    patterns = TreeTools.calculatePatternsFromTree(reference_tree,
                                                   options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
Exemplo n.º 16
0
def ParseTree(reference_tree, rx_species):
    """Parse the reference tree and number its taxa.

    Returns a tuple of the first tree parsed from *reference_tree* and
    a dictionary mapping each terminal taxon to a numeric id. The
    additional key "unknown" is given the next free id. *rx_species* is
    not used. Output is controlled by the module-level
    ``param_loglevel``.
    """

    reference_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    if param_loglevel >= 3:
        print("# reference tree:")
        reference_tree.display()

    map_taxon2id = {}
    for terminal in reference_tree.get_terminals():
        taxon = reference_tree.node(terminal).get_data().taxon
        # ids are assigned in order of first appearance
        map_taxon2id[taxon] = len(map_taxon2id)
        if param_loglevel >= 2:
            print("# %s\t%i" % (taxon, map_taxon2id[taxon]))

    map_taxon2id["unknown"] = len(map_taxon2id)

    return reference_tree, map_taxon2id
Exemplo n.º 17
0
    def testGetMergers(self):
        """
        Check that each merger found is listed in the reference.

        A merger counts as present if the pair of (strain, gene) tuples
        is found in the reference in either order.

        TODO: add testing for transcripts
        """
        print("testGetMergers()")

        for lines, reference, map_strain2species, options in self.mTestData:
            tree = TreeTools.Newick2Nexus(lines).trees[0]
            mergers = tree_strain2species.getMergers(
                tree, map_strain2species, options)

            for merger in mergers:
                node_id, species, strain_x, gene_x, strain_y, gene_y = merger
                first = (strain_x, gene_x)
                second = (strain_y, gene_y)
                forward = (first, second)
                backward = (second, first)

                if forward in reference or backward in reference:
                    continue

                self.fail("%s not in reference %s" %
                          (str(forward), str(reference)))
Exemplo n.º 18
0
    def processChunk(lines, map_strain2species, options):
        """Apply gene mergers to all trees in a chunk and write them out.

        *lines* is parsed with ``TreeTools.Newick2Nexus``. For each tree
        the mergers are collected and applied. Trees left with at most
        one terminal are skipped. For all other trees the merged genes
        are written to ``output_genes`` and the tree is written to
        ``options.stdout`` in ``options.output_format``.

        A warning is logged if a (strain, gene) pair has already been
        seen in an earlier tree.

        The counters ``ninput``, ``nmerged``, ``nskipped``, ``noutput``
        and ``nwarnings`` are updated; ``counters``, ``merged`` and
        ``output_genes`` are read from the enclosing scope.
        """
        # nwarnings is rebound below and therefore has to be declared
        # here as well.
        # NOTE(review): assumes nwarnings lives at module level like the
        # other counters -- confirm.
        global ninput, noutput, nskipped, nmerged, nwarnings

        nexus = TreeTools.Newick2Nexus(lines)

        for tree in nexus.trees:
            ninput += 1

            if options.loglevel >= 3:
                tree.display()

            mergers = getMergers(tree, map_strain2species, options)

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# found %i pairs of genes that will be merged.\n" %
                    (len(mergers)))

            if len(mergers) > 0:
                nmerged += 1

            n = applyMergers(tree, mergers, counters, map_strain2species,
                             options)

            if len(tree.get_terminals()) <= 1:
                nskipped += 1
                continue

            for new_name, values in n.items():
                for strain, gene in values:
                    key = (strain, gene)
                    if key in merged:
                        # the message needs strain, gene and the tree in
                        # which the pair was seen before
                        options.stdlog.write(
                            "# warning: strain %s and gene %s already appeared in tree %s\n"
                            % (strain, gene, merged[key]))
                        nwarnings += 1
                    # remember the tree so that the warning above can
                    # name it
                    merged[key] = tree.name
                    output_genes.write("%s\t%s\n" % (options.separator.join(
                        (strain, gene)), new_name))

            tree.writeToFile(options.stdout, format=options.output_format)
            noutput += 1
Exemplo n.º 19
0
def GetPrunedReferenceTree( mask, present_orgs, reference_tree ):
    """Return a copy of the reference tree restricted to *present_orgs*.

    *reference_tree* is parsed anew on every call, because the parsed
    tree is modified here. Every terminal whose taxon is not in
    *present_orgs* is removed with ``Prune``. *mask* is not used.
    Output is controlled by the module-level ``param_loglevel``.
    """

    # a fresh parse serves as the copy to be modified
    pruned_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    # keep only those taxa which are present in the cluster
    for terminal in pruned_tree.get_terminals():
        taxon = pruned_tree.node(terminal).get_data().taxon
        if taxon in present_orgs:
            continue
        Prune(pruned_tree, taxon)

    if param_loglevel >= 3:
        print("# pruned reference tree for %s:" % (",".join(present_orgs.keys())))
        pruned_tree.display()

    return pruned_tree
Exemplo n.º 20
0
        sort_order = [],
        )

    (options, args) = E.Start( parser )

    if not options.sort_order:
        for nx in reference_tree.get_terminals():
            options.sort_order.append( reference_tree.node(nx).get_data().taxon )
    else:
        options.sort_order = options.sort_order.split(",")

    if not options.reference_tree:
        raise "no reference tree defined."

    nexus = TreeTools.Newick2Nexus( options.reference_tree )
    reference_tree = nexus.trees[0]
    
    if options.loglevel >= 3:
        print "# reference tree:"
        print reference_tree.display()

    patterns = TreeTools.calculatePatternsFromTree( tree, options.sort_order )

    for p in patterns:
        print p

    E.Stop()



Exemplo n.º 21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="string",
                      help="method to use [counts|lists|hists|links].")

    parser.add_option("-o",
                      "--filename-output",
                      dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f",
                      "--functions",
                      dest="functions",
                      type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l",
                      "--locations",
                      dest="locations",
                      type="string",
                      help="locations to grep [local|nojunk|all|...].")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")

    parser.add_option("-i",
                      "--fit",
                      dest="fit",
                      type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height",
                      dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse",
        dest="reverse",
        action="store_true",
        help="""reverse species. Histograms will show the age of duplications for
                      duplicates in other genomes.""")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    options.species = options.species.split(",")
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    if len(options.species) == 0:
        raise "please supply list of species."

    dbhandle = pgdb.connect(options.psql_connection)

    input_data = map(lambda x: x[:-1].split("\t"),
                     filter(lambda x: x[0] != "#", sys.stdin.readlines()))

    ## remove header
    if options.header:
        del input_data[0]

    ## decide which columns to take
    ## 1st column: species1: this is the species in which duplications have occured.
    ## 2nd column: species2: this is the species with respect to which duplications occured.
    ## 3rd column: clusterid
    ## 4th column: chromosomes
    ## 5th column: function
    ## 6th column: height
    ## 7th column: relative height
    ## 8th column: locations
    ## 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    for x in range(len(input_data)):
        input_data[x] = tuple([input_data[x][y] for y in take])

    map_pos2species = []
    map_species2pos = {}
    for x in range(len(options.species)):
        map_species2pos[options.species[x]] = x
        map_pos2species.append(options.species[x])

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
        elif options.method == "medians":
            func = numpy.median

        for location in options.locations:

            for function in options.functions:
                matrix = numpy.zeros(
                    (len(options.species), len(options.species)), numpy.Float)

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(
                                       values)

                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    dict = {"f": function, "l": location}
                    outfile = open(options.filename_output % dict, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                if options.method == "medians":
                    format = "%6.4f"
                elif options.method == "counts":
                    format = "%i"
                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=format,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        ## write lists of duplicated genes in species1 as compared to species2
        ##      according to location/function
        ## First field : gene name
        ## Second field: cluster id
        ## Third field : number of other genes in cluster
        ## Fourth field: location of gene
        written = {}
        for location in options.locations:

            for function in options.functions:

                values = [[[] for y in range(len(options.species))]
                          for x in range(len(options.species))]

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## write trees per cluster
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile: outfile.close()
                                dict = {
                                    "f": function,
                                    "l": location,
                                    "s": species1,
                                    "o": species2
                                }
                                written = {}
                                outfile = open(options.filename_output % dict,
                                               "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile: outfile.close()
                                    dict = {
                                        "f": function,
                                        "l": location,
                                        "s": species1
                                    }
                                    written = {}
                                    outfile = open(
                                        options.filename_output % dict, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n"
                                    % (location, function, species1, species2))
                                written = {}
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n"
                                        % (location, function, species1))
                                    written = {}

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written: continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written[t] = 1

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:

            for function in options.functions:

                values = [[[] for y in range(len(options.species))]
                          for x in range(len(options.species))]

                data = GetSubset(input_data, location, function)

                data.sort()

                ################################################################
                ## convert to matrix of list
                ## values[x][y] contains heights of duplications in species x with reference to y

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        continue

                ################################################################
                ################################################################
                ################################################################
                # calculate histograms per species
                ################################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        dict = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % dict, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(len(options.species)):

                        if options.reverse:
                            ## duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            ## duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            pass
                        else:
                            headers.append(options.species[x])
                            h = Histogram.Calculate(
                                vv,
                                increment=options.bin_size,
                                min_value=options.min_value,
                                max_value=options.max_value,
                                no_empty_bins=True)

                            if options.method == "fit-decay":
                                result = fit(h, [2.0, -1.0])
                                if result:
                                    outfile.write(
                                        "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n"
                                        % (
                                            "function",
                                            s,
                                            options.species[x],
                                            h[0][1],
                                            result[0],
                                            result[1],
                                            result[0],
                                            result[1],
                                        ))
                            elif options.method == "hists":
                                histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        ## get branches with 0 branchlength

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2, last_cluster_id = None, None, None

                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## write trees per cluster
                        if options.filename_output:
                            if outfile: outfile.close()
                            dict = {
                                "f": function,
                                "l": location,
                                "s": species1,
                                "o": species2
                            }
                            outfile = open(options.filename_output % dict, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2
                        last_cluster_id = None

                    if last_cluster_id != cluster_id:
                        if last_cluster_id != None:
                            pass

                        last_cluster_id = cluster_id

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

    elif options.method == "links":

        ## write a tree for each species pair:
        ## each node is a gene+location, the weight of the vertex is the height
        ## further info added: cluster_id for the duplication

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                ## stores duplications within first species as compared to second species
                values = [[[] for y in range(len(options.species))]
                          for x in range(len(options.species))]

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("#     processing species %s\n" %
                                             s)

                    headers = []
                    for x in range(len(options.species)):

                        if map_pos2species[x] == s: continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        ## write trees per cluster
                        if options.filename_output:
                            dict = {
                                "f": function,
                                "l": location,
                                "s": s,
                                "o": map_pos2species[x]
                            }
                            outfile = open(options.filename_output % dict, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, s, map_pos2species[x]))

                        ## only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
Exemplo n.º 22
0
def main(argv=None):
    """script main.

    Reads a reference (species) tree given by ``--reference-tree`` and a
    collection of gene trees from stdin, prunes the reference tree to the
    species found in the gene trees and then calls :func:`writeOrthologSets`
    once for every requested ``--method``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if method ``outgroup`` is requested without
            ``--outgroups``, if no reference tree is supplied, or if the
            species can not be extracted from the taxa of an input tree.
    """

    # NOTE(review): argv is normalised here but is not forwarded to E.Start()
    # below - confirm whether E.Start() should receive it.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-e",
                      "--enumeration",
                      dest="enumeration",
                      type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("strict", "degenerate", "any", "outgroup",
                               "lineage"),
                      help="sets to extract.")

    parser.add_option("-s",
                      "--species-set",
                      dest="species_set",
                      type="string",
                      help="comma separated list of species.")

    parser.add_option("-g",
                      "--outgroups",
                      dest="outgroups",
                      type="string",
                      help="comma separated list of outgroup species.")

    parser.add_option(
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--reroot",
                      dest="reroot",
                      type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    # The regular expressions are raw strings so that ``\|`` reaches the
    # regex engine verbatim; the pattern text itself is unchanged.
    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    # default to strict ortholog sets if no method was requested
    if len(options.methods) == 0:
        options.methods.append("strict")

    # an explicit species set overrides the enumeration mode
    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    #######################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    ########################################################################
    # read the reference tree
    ########################################################################
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    if options.reference_tree[0] == "(":
        # the option value itself is a newick string
        nexus = TreeTools.Newick2Nexus(options.reference_tree)
    else:
        # the option value is a filename; make sure the handle is released
        infile = open(options.reference_tree, "r")
        try:
            nexus = TreeTools.Newick2Nexus(infile)
        finally:
            infile.close()
    reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    ########################################################################
    # read all trees
    ########################################################################
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    ########################################################################
    # sort out reference tree
    ########################################################################
    # Compile the patterns once up front so that a malformed regular
    # expression is reported here; the compiled objects are not needed.
    re.compile(options.species_regex)
    re.compile(options.gene_regex)
    extract_species = lambda x: parseIdentifier(x, options)[0]
    extract_gene = lambda x: parseIdentifier(x, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set = species_set.union(
                set(map(extract_species, tree.get_taxa())))
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" % str(
                    tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    # column order of the output: either given explicitly or taken from the
    # terminal nodes of the (pruned) reference tree
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = []
        for nx in reference_tree.get_terminals():
            options.column2org.append(reference_tree.node(nx).get_data().taxon)

    # inverse mapping: organism name -> column index
    options.org2column = {}
    for column, org in enumerate(options.column2org):
        options.org2column[org] = column

    for method in options.methods:

        ###################################################################
        # print out a list of ortholog clusters
        ###################################################################
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
Exemplo n.º 23
0
def writeOrthologSets(outfile,
                      nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    The species sets to look for are enumerated according to
    ``options.enumeration``; an unrecognised enumeration yields no sets.
    For every tree in ``nexus.trees`` and every species set, the nodes
    returned by :func:`getOrthologNodes` are written as one table row each
    to ``options.stdout``. A per-set summary follows, written either to
    ``options.filename_summary`` or, after a ``//`` separator line, to
    ``options.stdout``.

    Arguments:
        outfile: accepted for interface compatibility. The cluster table is
            written to ``options.stdout`` and the name is rebound for the
            summary output, so the value passed in is not used.
        nexus: object with a ``trees`` attribute holding the gene trees.
        extract_species: callable mapping a taxon label to its species.
        extract_gene: callable mapping a taxon label to its gene.
        options: option object; ``column2org``, ``org2column``,
            ``enumeration``, ``species_set``, ``reroot``, ``loglevel``,
            ``stdout``, ``stdlog`` and ``filename_summary`` are read.
        reference_tree: species tree, required for "monophyletic"
            enumeration.
        method: selector passed on to :func:`getOrthologNodes`.
        outgroups: optional list of outgroup species names.

    Raises:
        ValueError: if enumeration is "monophyletic" and no reference tree
            is given.
    """

    ######################################################################
    # build species set to compare
    sets = []
    species = options.column2org
    nspecies = len(species)

    if options.enumeration == "monophyletic":
        if not reference_tree:
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif options.enumeration == "exhaustive":
        # all combinations of size 2 .. n-1, plus the complete set
        for size in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif options.enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif options.enumeration == "full":
        sets.append(species)

    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif options.enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets += list(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information (column indices)
    xsets = [
        frozenset([options.org2column[org] for org in members])
        for members in sets
    ]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set()
        for org in outgroups:
            noutgroups.add(options.org2column[org])
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree,
                                              sn,
                                              options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # A real list is required: the taxa are iterated twice below
                # (gene extraction and output), which a one-shot iterator
                # would not survive.
                otus = [
                    taxon for taxon in tree.get_taxa(node_id)
                    if extract_species(taxon) in sl
                ]
                genes = set(map(extract_gene, otus))

                overlap = found_genes.intersection(genes)
                if overlap:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # NOTE(review): both cases are only counted and logged;
                    # the cluster is still written out below.
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id, str(overlap)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id, str(overlap)))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information

    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = options.stdout
        outfile.write("//\n")

    try:
        outfile.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

        for c in range(len(xsets)):
            pattern = buildPattern(nspecies, xsets[c])
            outfile.write("%i\t%s\t%i\t%s\n" %
                          (c, "".join(pattern), counts[c], "\t".join(pattern)))
    finally:
        # close the summary file even if writing fails; never close stdout
        if outfile is not options.stdout:
            outfile.close()
Exemplo n.º 24
0
def getOrthologNodes(tree,
                     positive_set,
                     options,
                     selector="strict",
                     outgroups=None):
    """get all ortholog nodes in tree for species in positive_set.

    Depending on the selector function, different sets are returned:

    If selector is "strict", only strict orthologs are returned. These
    contain exactly one gene per species for all species in the positive_set.

    If selector is "degenerate", only degenerate orthologs are returned.
    These contain at least one gene per species for species in the
    positive_set.

    Collect genes in tree for each species.

    Returns the node_id for which a set fulfills the criteria
    and the set for which it fulfills it.

    Avoid double counting: if you are interested in species A and B,
    any branches involving others species should be ignored. Make sure to only
    count once and not every time a discarded branch is removed.

    Thus, as soon as A and B merge, any node further up the tree have to be
    ignored.

    total_genes_function:   if true, node is recorded
    total_species_function: if true, iteration stops

    Arguments:
        tree: gene tree; traversed with ``tree.dfs(..., post_function=...)``.
        positive_set: collection of species column indices of interest.
        options: option object; ``org2column``, ``loglevel`` and ``stdlog``
            are read here.
        selector: one of "strict", "degenerate", "lineage", "any" or
            "outgroup".
        outgroups: collection of outgroup column indices; required for the
            "outgroup" selector.

    Returns:
        list of ``(node_id, genes)`` tuples in the order in which the nodes
        were recorded during the traversal; ``genes`` is a list with one set
        of genes per species column.

    Raises:
        ValueError: for an unknown selector, or if the "outgroup" selector
            is chosen without outgroups.
    """

    nspecies = len(options.org2column)

    if selector == "strict":
        # strict orthologs: at most one gene per species
        exit_function = lambda num_genes_for_species: num_genes_for_species > 1
        keep_function = lambda num_genes_for_species: num_genes_for_species == 1
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: num_genes_at_node == num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "degenerate":
        # degenerate orthologs: any number of genes per species,
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 0
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: num_genes_at_node > num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "lineage":
        # lineage specific duplications: the stop criterion never fires, so
        # nodes are recorded through the negative_set branch below and any
        # leaf of a species outside positive_set terminates its lineage.
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 1
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: num_genes_at_node >= num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: False
        check_outgroup_function = lambda x: False
        negative_set = set(range(nspecies)).difference(positive_set)
    elif selector == "any":
        # any number of orthologs, including
        # orphans
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: True
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: True
        total_species_function = lambda num_species_at_node, num_species_in_pattern: num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "outgroup":
        # group selector
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 0
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: False
        total_species_function = lambda num_species_at_node, num_species_in_pattern: False
        # check for outgroup: needs to have outgroup and at least one other
        # species, ie.: the number of genes in outgroups is non-zero but
        # smaller than the number of genes in all species
        if not outgroups:
            raise ValueError(
                "usage error: please supply outgroups if 'outgroup'-selector is chosen."
            )
        check_outgroup_function = lambda genes: 0 < sum(
            [len(genes[x])
             for x in outgroups]) < sum(map(lambda x: len(x), genes))
        negative_set = set()
    else:
        raise ValueError("unknown selector %s" % selector)

    # work here: set genes[node_id] to None,
    # 1. if the gene count for a species of interest is > 1
    # 2. if the gene count for all species of interest is 1 in
    # the child node.

    if options.loglevel >= 5:
        options.stdlog.write("# gene tree\n")
        tree.display()

    # genes[node_id][column] is the set of genes of species ``column`` found
    # below node ``node_id``; a terminated node is marked with None.
    n = TreeTools.GetSize(tree) + 1
    genes = [[set() for _ in range(nspecies)] for _ in range(n)]

    ortholog_nodes = []

    def count_genes(node_id):
        """record number of genes per species for each node
        """
        node = tree.node(node_id)

        if options.loglevel >= 6:
            options.stdlog.write("# node_id=%i\n" % node_id)
            if options.loglevel >= 10:
                options.stdlog.write("# sets=%s\n" % (str(genes)))

        # species in pattern
        num_species_in_pattern = len(positive_set)

        if not node.succ:
            # process leaf
            s, t, g, q = parseIdentifier(node.data.taxon, options)
            c = options.org2column[s]
            if c in positive_set:
                genes[node_id][c].add(g)
            elif c in negative_set:
                genes[node_id] = None
            return

        # process non-leaf node

        # only reported in the debug output; initialised so that it is
        # defined even if positive_set is empty
        num_genes_for_species = 0

        for s in node.succ:

            # propagate: terminated nodes force upper nodes to terminate
            # (assigned to None).
            if not genes[s]:
                genes[node_id] = None
                return

            # The counters are reset for every child, but the gene sets at
            # this node accumulate, so after the last child they describe
            # the union over all children.
            # total number of genes at node
            num_genes_at_node = 0
            # total number of species at node
            num_species_at_node = 0

            # compute new gene set for each species at node
            for x in positive_set:
                genes[node_id][x] = genes[node_id][x].union(genes[s][x])

                num_genes_for_species = len(genes[node_id][x])
                if exit_function(num_genes_for_species):
                    genes[node_id] = None
                    return
                num_genes_at_node += num_genes_for_species
                if num_genes_for_species:
                    num_species_at_node += 1

        if options.loglevel >= 6:
            # debugging output goes to the log, not to the data on stdout
            options.stdlog.write(
                "# node=%i species_at_node=%i genes_at_node=%i "
                "num_genes_for_species=%i ngenes=%i\n" %
                (node_id, num_species_at_node, num_genes_at_node,
                 num_genes_for_species,
                 sum([len(g) for g in genes[node_id]])))
            options.stdlog.write("# genes at node %i\t%s\n" %
                                 (node_id, genes[node_id]))
            if outgroups:
                options.stdlog.write(
                    "# genes in outgroups=%i\n" %
                    sum([len(genes[node_id][x]) for x in outgroups]))
                options.stdlog.write(
                    "# outgroup check=%s\n" %
                    check_outgroup_function(genes[node_id]))

        # check stop criterion
        if total_species_function(num_species_at_node,
                                  num_species_in_pattern):
            # check if positive requirements are fulfilled
            for x in positive_set:
                if not keep_function(len(genes[node_id][x])):
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# keep function false for species %i\n" % x)
                    break
            else:
                if total_genes_function(num_genes_at_node,
                                        num_species_in_pattern):
                    if options.loglevel >= 6:
                        options.stdlog.write("# recording node %i\n" %
                                             node_id)
                    ortholog_nodes.append((node_id, genes[node_id]))
            # the recorded list object stays referenced from ortholog_nodes;
            # only the slot is cleared so that nodes further up terminate
            genes[node_id] = None
            return
        elif check_outgroup_function(genes[node_id]):
            ortholog_nodes.append((node_id, genes[node_id]))
            genes[node_id] = None
            return
        elif negative_set:
            if total_genes_function(num_genes_at_node,
                                    num_species_in_pattern):
                if options.loglevel >= 6:
                    options.stdlog.write("# recording node %i\n" % node_id)
                ortholog_nodes.append((node_id, genes[node_id]))

    tree.dfs(tree.root, post_function=count_genes)

    return ortholog_nodes
Exemplo n.º 25
0
def processMali(mali, options):
    """Train a block-wise codon grammar on *mali* and report rates per block.

    Steps performed:

    1. If ``options.block_size`` is set, a ``STATE`` annotation is added
       that labels the leading columns "N" and the remaining columns "C".
    2. Identifiers containing ``options.separator`` are renamed to the part
       before the first separator.
    3. The grammar is prepared/trained through ``prepareGrammar`` and the
       rate matrix is obtained through ``RateEstimation.getRateMatrix``.
    4. For each block one tab-separated row is written to
       ``options.stdout``; values that cannot be computed because of a
       ``ZeroDivisionError`` are reported as "na".

    Arguments
    ---------
    mali :
        multiple alignment object. ``addAnnotation``, ``rename`` and
        (with ``options.clean_mali``) ``removeGaps`` are called on it.
    options :
        parsed command line options; the attributes read here are
        ``block_size``, ``separator``, ``xrate_min_increment``,
        ``clean_mali``, ``input_filename_tree``, ``loglevel``, ``stdlog``,
        ``stdout``, ``shared_rates``, ``value_format`` and ``with_rho``.

    Raises
    ------
    ValueError
        if the alignment has no columns.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid exceptions - raise a real one
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # block size given as a fraction of the number of codons
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # block size given as a number of codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.input_filename_tree, "r"))
        tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed - pull out its only member explicitly
        blocks = (("B0_", list(chars)[0]), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(xgram, mali, tree, map_old2new, blocks,
                                       options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each value of --shared-rates: which of the parameters
    # (Rs, Rn, Ri, Rv) are looked up with the block prefix (True) and
    # which without a prefix (False).  Any other value uses the block
    # prefix for all four parameters.
    uses_block_prefix = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }.get(options.shared_rates, (True, True, True, True))

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = [
            flag and block or "" for flag in uses_block_prefix]

        rs = trained_model.mGrammar.getParameter('%sRs' % prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # matrix with the Rs/Rn distinction averaged out
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # matrix with the Ri/Rv distinction averaged out.
            # NOTE(review): nothing derived from Q1 is written out below;
            # the calls are kept so that behaviour (including any
            # ZeroDivisionError they raise) is unchanged.
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # report the whole row as not available
            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = "na"
            o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(
            map(str, (code, block, o_dn, o_ds, o_omega, "na", "na", "na", "na",
                      o_kappa, result.getLogLikelihood(), "na", nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" +
                "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
Exemplo n.º 26
0
    def calculateCoordinates(self):
        """Compute the plot coordinates of all nodes of ``self.mTree``.

        Fills ``self.mNodeHeights`` (vertical position of each node),
        ``self.mNodeWidthsStart`` / ``self.mNodeWidthsEnd`` (horizontal
        start and end of each node's branch) and ``self.mMaxNodeHeight``.

        A scale factor of 0 in ``self.mHeightScaleFactor`` or
        ``self.mBranchScaleFactor`` requests automatic scaling: the
        coordinates are first computed with a provisional factor and then
        rescaled to ``self.mDefaultHeight`` / ``self.mDefaultWidth``, and
        the scale factor attribute is updated accordingly.
        """

        self.mNodeHeights = [0] * self.mNNodes
        self.mNodeWidthsStart = [0] * self.mNNodes
        self.mNodeWidthsEnd = [0] * self.mNNodes

        # if no scales are given, try to do best fit
        rescale_height = self.mHeightScaleFactor == 0
        if rescale_height:
            self.mHeightScaleFactor = 1

        rescale_width = self.mBranchScaleFactor == 0
        if rescale_width:
            self.mBranchScaleFactor = 100

        ##########################################################
        # Get vertical coordinates.
        # Terminal nodes are stacked one after another; an internal node
        # is placed at the average height of its children.
        # The running offset lives in a one-element list so that the
        # nested function can update it.
        counter = [0]

        def updateHeights(node_id):
            children = self.mTree.node(node_id).succ
            nchildren = len(children)
            if nchildren:
                # internal node: uncorrected mean of the child heights
                # (decorator heights are deliberately not added here).
                total = sum(self.mNodeHeights[x] for x in children)
                self.mNodeHeights[node_id] = float(total) / float(nchildren)
            else:
                # external node: place at the current offset and advance
                # the offset by the taller decorator plus the separator.
                self.mNodeHeights[node_id] = counter[0]
                counter[0] += max(
                    self.mDecoratorExternalNodes.getHeight(node_id),
                    self.mDecoratorHorizontalBranches.getHeight(node_id)) \
                    * self.mHeightScaleFactor + self.mTerminalLabelSeparator

        TreeTools.TreeDFS(self.mTree,
                          self.mTree.root,
                          post_function=updateHeights)

        self.mMaxNodeHeight = counter[0]

        ##########################################################
        # Get horizontal coordinates
        def updateWidths(node_id):
            node = self.mTree.node(node_id)
            length = node.data.branchlength
            # set default branchlength to 0.01 for empty branch lengths
            # TODO: deal with trees without branch lengths later.
            if length <= 0.0:
                length = 0.01
            right = self.mNodeWidthsStart[node_id] + int(
                length * self.mBranchScaleFactor)
            self.mNodeWidthsEnd[node_id] = right
            # every child branch starts where this branch ends
            for child in node.succ:
                self.mNodeWidthsStart[child] = right

        TreeTools.TreeDFS(self.mTree,
                          self.mTree.root,
                          pre_function=updateWidths)

        if rescale_height:
            tallest = max(self.mNodeHeights)
            # all heights 0 (e.g. a single leaf): nothing to rescale, and
            # dividing by the maximum would fail.
            if tallest != 0:
                f = float(self.mDefaultHeight) / tallest
                if 100 * f < 1:
                    f = 0.01
                self.mHeightScaleFactor = 100 * f
                # build real lists: the heights are modified by index below
                self.mNodeHeights = [int(x * f) for x in self.mNodeHeights]
                self.mMaxNodeHeight *= f

        if rescale_width:
            widest = max(self.mNodeWidthsEnd)
            # same guard as for the heights
            if widest != 0:
                f = float(self.mDefaultWidth) / widest
                self.mBranchScaleFactor = 100 * f
                self.mNodeWidthsStart = [int(x * f)
                                         for x in self.mNodeWidthsStart]
                self.mNodeWidthsEnd = [int(x * f)
                                       for x in self.mNodeWidthsEnd]

        # add a safety margin for decorators writing above the line. This
        # is a patch and should be changed such that decorators report
        # their correct height
        for x in range(self.mNNodes):
            self.mNodeHeights[x] += 45
Exemplo n.º 27
0
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        raise "please supply a species tree."

    nexus = TreeTools.Newick2Nexus(tree_lines)
    Tree.updateNexus(nexus)
    tree = nexus.trees[0]

    if options.loglevel >= 2:
        tree.display()

    plot = SVGTree(tree)

    plot.setBranchScale(options.branch_scale)
    plot.setHeightScale(options.height_scale)

    if options.colour_by_species:
        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
Exemplo n.º 28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads duplication clusters from stdin (tab-separated: cluster id,
    ``;``-separated locations of the form
    ``gene_id:contig:strand:from:to``, tree of duplications), places them
    on a :class:`DuplicationPlot` and writes the plot to stdout.

    Only contigs listed in the file given by ``--contig-sizes`` are
    plotted.  If no contig remains the script stops without output.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="string",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Without a contig size file there are no known contigs.  Start from
    # an empty map so that the lookups below do not hit an unbound name;
    # the "no contigs" exit further down then applies.
    map_contig2size = {}
    if options.filename_contig_sizes:
        map_contig2size = IOTools.ReadMap(open(options.filename_contig_sizes,
                                               "r"),
                                          map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        used_contigs = set()
        for line in lines:
            if line.startswith("#"):
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig in map_contig2size:
                    used_contigs.add(contig)

        # iterate over a copy of the keys: the map is modified in the loop
        for contig in list(map_contig2size.keys()):
            if contig not in used_contigs:
                del map_contig2size[contig]

    if len(map_contig2size) == 0:
        E.Stop()
        sys.exit(0)

    # order by name first; the sort by size is stable, so contigs of
    # equal size stay in name order.
    contigs = sorted(map_contig2size.keys())

    if options.sort_by_size:
        contigs.sort(key=lambda x: map_contig2size[x])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for line in lines:
            if line.startswith("#"):
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            # smallest start / largest end position within the plot and
            # number of locations on known contigs
            mi, ma = 0, 0
            n = 0
            cluster_contigs = set()
            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig not in map_contig2size:
                    continue
                cluster_contigs.add(contig)
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                if not mi:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)

            if n == 0:
                # no location of this cluster lies on a known contig
                continue

            # cis: all locations are on the same contig
            cis = len(cluster_contigs) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    # the plot is initialized again now that the number of entries is known
    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for location in in_locations.split(";"):
            gene_id, contig, strand, sbjct_from, sbjct_to = \
                location.split(":")
            if contig not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (contig, strand, sbjct_from,
                                          sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            c = [x.split(options.separator) for x in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
Exemplo n.º 29
0
def processMali(mali, options):
    """Select sequence pairs of *mali* and run the configured rate method.

    The pairs to compare are chosen according to ``options.iteration``
    ("all-vs-all", "first-vs-all", "pairwise" or "tree"), stop codons are
    optionally masked with "NNN", and the alignment is then handed to
    ``runCodeML`` (``options.method == "paml"``) or ``runXrate``
    (``options.method == "xrate"``).

    Arguments
    ---------
    mali :
        multiple alignment object.
    options :
        parsed command line options; the attributes read here are
        ``gap_chars``, ``mask_chars``, ``iteration``, ``remove_stops``,
        ``min_overlap``, ``tree`` and ``method``.

    Raises
    ------
    ValueError
        if ``options.iteration`` is unknown, or if it is "pairwise" and
        the alignment holds an odd number of sequences.
    """

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()
    nids = len(ids)

    invalid_chars = options.gap_chars + options.mask_chars

    # pairs are tuples of indices into ``ids``
    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(nids) for y in range(0, x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, nids)]
    elif options.iteration == "pairwise":
        if nids % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % nids)
        pairs = [(x, x + 1) for x in range(0, nids, 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        # replace every stop codon by NNN, reading in frame from position 0
        for _identifier, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for start in range(0, len(s), 3):
                codon = s[start:start + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)

            entry.mString = "".join(fragments)

    # Flag the alignment if any pair shares fewer than
    # ``options.min_overlap`` columns in which both residues are valid.
    has_non_overlaps = False
    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            # inner loop ran to completion: overlap threshold not reached
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)

    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
Exemplo n.º 30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin and combines them according to ``--method``:

    * ``non-redundant``: write one tree per group of mutually compatible
      trees (as decided by ``TreeTools.IsCompatible``).
    * ``select-largest``: group trees by name (optionally reduced with
      ``--regex-id``) and write the tree chosen by ``getBestTree`` for
      each group.
    * ``consensus``: build a consensus tree with phylip's ``consense``.
    * ``min``, ``max``, ``sum``, ``mean``, ``counts``, ``stddev``,
      ``median``: aggregate branch lengths node by node into the first
      tree and write that tree.

    Raises ``ValueError`` if a node has no usable branch length and
    ``--error-branchlength`` was not given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean", "median", "stddev", "non-redundant", "consensus",
                               "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values", type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength", dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    # branch lengths equal to one of these values are treated as missing
    filtered = options.filtered_branch_lengths

    if options.method == "non-redundant":
        # compute non-redundant trees: every tree is either counted
        # towards the first compatible template or becomes a new template.
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x, template in enumerate(template_trees):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template)
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" % (ntree, len(template_trees)))

            ntree += 1

        for x, template in enumerate(template_trees):
            if options.loglevel >= 1:
                # percentage is relative to the number of input trees
                options.stdlog.write("# tree: %i, counts: %i, percent=%5.2f\n" %
                                     (x, template_counts[x], template_counts[x] * 100.0 / ninput))
            options.stdout.write(
                TreeTools.Tree2Newick(template) + "\n")
            noutput += 1

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                n = re.search(options.regex_id, n).groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for cluster in clusters.values():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster], options.method))

        for new_tree in new_trees:
            options.stdout.write(">%s\n" % new_tree.name)
            options.stdout.write(TreeTools.Tree2Newick(new_tree,) + "\n")
            noutput += 1

        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" % (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree accumulates the result
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in filtered:
                    xtree.node(n).data.branchlength = 0

            # number of values per node; the first tree counts as one.
            # The list is indexed by node id.
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method in ("sum", "mean"):
                # the mean is a sum that is divided by ntotals below
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in filtered:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:

                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in filtered:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength, tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(
                                n).data.branchlength = options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))
                                nerrors += 1
                        else:
                            raise ValueError("no counts for node %i" % n)

        else:
            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in filtered:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = scipy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = scipy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))
                            nerrors += 1
                    else:
                        raise ValueError("no counts for node %i" % n)

            if options.write_values:
                # one line per node: joined leaf names, then sorted values
                outfile = open(options.write_values, "w")
                try:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        leaves = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write("%s\t%s\n" %
                                      (leaves, ";".join(map(str, values[n]))))
                finally:
                    outfile.close()

        # only the first tree (holding the aggregate) is written
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
            ninput, nskipped, noutput, nerrors))

    E.Stop()
Exemplo n.º 31
0
    if options.prefix:
        prefix_tree = ">%s\n" % options.prefix
        prefix_header = "prefix\t"
        prefix_row = "%s\t" % options.prefix
    else:
        prefix_tree = ""
        prefix_header = ""
        prefix_row = ""

    for method in options.methods:

        if method == "write-ks-tree":
            for result in results:
                options.stdout.write(prefix_tree +
                                     TreeTools.Tree2Newick(result.mTreeKs) +
                                     "\n")

        elif method == "write-ka-tree":
            for result in results:
                options.stdout.write(prefix_tree +
                                     TreeTools.Tree2Newick(result.mTreeKa) +
                                     "\n")

        elif method == "write-kaks-tree":
            for result in results:
                options.stdout.write(prefix_tree +
                                     TreeTools.Tree2Newick(result.mTreeKaks) +
                                     "\n")

        elif method == "lrt":
Exemplo n.º 32
0
def main(argv=None):
    """Count genes and transcripts per organism for every input cluster.

    Clusters are read from stdin in the format chosen with ``--format``:

    * ``map``: lines of ``identifier<TAB>cluster``,
    * ``trees``: newick trees, each tree being one cluster of its taxa,
    * ``links``: sections introduced by a ``>`` header line followed by
      tab-separated lines whose first two fields are member identifiers.

    Lines starting with ``#`` are skipped in the ``map`` and ``links``
    formats.  Member identifiers are split on ``options.separator`` and
    must have either four fields (species, transcript, gene, a fourth
    field that is ignored) or two fields (species, gene).

    Three tables are written:

    * one row per cluster (sorted by cluster name) to ``options.stdout``,
    * the pattern counts to ``--filename-patterns`` or ``sys.stdout``; an
      ``isok`` column is added when a reference tree was supplied,
    * per-species totals to ``--filename-summary`` or ``sys.stdout``.

    :param argv: command line; defaults to ``sys.argv``.  Note that it is
        currently not forwarded to ``E.Start()``.
    :raises ValueError: if no cluster was read, if an identifier can not
        be parsed, or if a member belongs to a species without a column.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is never handed to E.Start() below, so a caller
    # supplied argument list has no visible effect -- confirm whether
    # E.Start() accepts an ``argv`` keyword before forwarding it.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    # Raw strings: same pattern values as before, without relying on
    # unknown escape sequences being passed through.
    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read the optional reference tree
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            # the option value is the newick string itself
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            # assumes Newick2Nexus consumes the file before returning
            # (the trees are indexed right below) -- TODO confirm
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:\n")
            sys.stdout.write(str(reference_tree.display()) + "\n")

    ########################################################################
    ## read clusters: cluster name -> collection of member identifiers
    ########################################################################
    clusters = {}
    if options.format == "map":

        for line in sys.stdin:
            if line.startswith("#"):
                continue
            # rstrip instead of [:-1]: the last line may lack a newline
            member, cluster_id = line.rstrip("\n").split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":

        nexus = TreeTools.Newick2Nexus(sys.stdin)

        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        header_regex = re.compile(r">cluster #(\d+)")
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            line = line.rstrip("\n")

            if line.startswith(">"):
                # a new section starts: store the one collected so far.
                # Links seen before the first header are dropped.
                if cluster_id:
                    clusters[cluster_id] = members
                match = header_regex.match(line)
                if match:
                    cluster_id = match.group(1)
                else:
                    cluster_id = line[1:]
                members = set()
                continue

            first, second = line.split("\t")[:2]
            members.add(first)
            members.add(second)

        if cluster_id:
            clusters[cluster_id] = members

    if not clusters:
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree and the organism <-> column mapping
    ########################################################################
    species_regex = re.compile(options.species_regex)
    # The gene regex is not used any further; it is compiled only so that
    # an invalid pattern is still rejected.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        """Return the first group of the species regex for *identifier*."""
        match = species_regex.search(identifier)
        if match is None:
            raise ValueError(
                "can not extract species from identifier %s" % identifier)
        return match.group(1)

    ## collect the species present in the input
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(member) for member in members)

    if reference_tree:
        ## prune tree to species present
        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]
    else:
        # sorted, so that the column order does not depend on hash order
        options.column2org = sorted(species_set)

    column2org = options.column2org
    options.org2column = dict(
        (org, column) for column, org in enumerate(column2org))
    org2column = options.org2column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    ########################################################################
    ## per-cluster counts
    ########################################################################
    notus = len(column2org)
    ncolumns = len(org2column)
    patterns = {}
    species_counts = [SpeciesCounts() for _ in column2org]

    ## first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(column2org), "\t".join(column2org)))

    for cluster in sorted(clusters):

        # per column: gene -> number of members, and number of members
        count_genes = [{} for _ in range(ncolumns)]
        count_transcripts = [0] * ncolumns

        for member in clusters[cluster]:
            fields = member.split(options.separator)

            if len(fields) == 4:
                species, transcript, gene = fields[:3]
            elif len(fields) == 2:
                species, gene = fields
                transcript = gene
            else:
                # previously such members silently reused the values of
                # the preceding member
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (member, options.separator))

            if species not in org2column:
                raise ValueError("unknown species %s" % species)

            col = org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_column = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_column)
        npresent_genes = sum(1 for ngenes in genes_per_column if ngenes > 0)

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join(
            (cluster, pattern, str(npresent_genes), str(ntotal_genes),
             "\t".join(map(str, genes_per_column)),
             str(ntotal_transcripts),
             "\t".join(map(str, count_transcripts)))) + "\n")

    ########################################################################
    ## write pattern summary
    ########################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        # legend: column index = organism
        for column, org in enumerate(column2org):
            outfile.write("# %i = %s\n" % (column, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    ########################################################################
    ## write summary counts per species
    ########################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        # rows in column order rather than in dictionary order
        for species, col in sorted(org2column.items(),
                                   key=lambda item: item[1]):
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species, len(counts.mTranscripts),
                           len(counts.mGenes), len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
Exemplo n.º 33
0
def main(argv=None):
    """Count genes and transcripts per organism for every input cluster.

    Clusters are read from stdin in the format chosen with ``--format``:

    * ``map``: lines of ``identifier<TAB>cluster``,
    * ``trees``: newick trees, each tree being one cluster of its taxa,
    * ``links``: sections introduced by a ``>`` header line followed by
      tab-separated lines whose first two fields are member identifiers.

    Lines starting with ``#`` are skipped in the ``map`` and ``links``
    formats.  Member identifiers are split on ``options.separator`` and
    must have either four fields (species, transcript, gene, a fourth
    field that is ignored) or two fields (species, gene).

    Three tables are written:

    * one row per cluster (sorted by cluster name) to ``options.stdout``,
    * the pattern counts to ``--filename-patterns`` or ``sys.stdout``; an
      ``isok`` column is added when a reference tree was supplied,
    * per-species totals to ``--filename-summary`` or ``sys.stdout``.

    :param argv: command line; defaults to ``sys.argv``.  Note that it is
        currently not forwarded to ``E.Start()``.
    :raises ValueError: if no cluster was read, if an identifier can not
        be parsed, or if a member belongs to a species without a column.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is never handed to E.Start() below, so a caller
    # supplied argument list has no visible effect -- confirm whether
    # E.Start() accepts an ``argv`` keyword before forwarding it.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")

    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")

    parser.add_option("-s", "--species-regex", dest="species_regex",
                      type="string",
                      help="regular expression to extract species from identifier.")

    parser.add_option("-g", "--gene-regex", dest="gene_regex", type="string",
                      help="regular expression to extract gene from identifier.")

    # Raw strings: same pattern values as before, without relying on
    # unknown escape sequences being passed through.
    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read the optional reference tree
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            # the option value is the newick string itself
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            # assumes Newick2Nexus consumes the file before returning
            # (the trees are indexed right below) -- TODO confirm
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:\n")
            sys.stdout.write(str(reference_tree.display()) + "\n")

    ########################################################################
    ## read clusters: cluster name -> collection of member identifiers
    ########################################################################
    clusters = {}
    if options.format == "map":

        for line in sys.stdin:
            if line.startswith("#"):
                continue
            # rstrip instead of [:-1]: the last line may lack a newline
            member, cluster_id = line.rstrip("\n").split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":

        nexus = TreeTools.Newick2Nexus(sys.stdin)

        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        header_regex = re.compile(r">cluster #(\d+)")
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line.startswith("#"):
                continue
            line = line.rstrip("\n")

            if line.startswith(">"):
                # a new section starts: store the one collected so far.
                # Links seen before the first header are dropped.
                if cluster_id:
                    clusters[cluster_id] = members
                match = header_regex.match(line)
                if match:
                    cluster_id = match.group(1)
                else:
                    cluster_id = line[1:]
                members = set()
                continue

            first, second = line.split("\t")[:2]
            members.add(first)
            members.add(second)

        if cluster_id:
            clusters[cluster_id] = members

    if not clusters:
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree and the organism <-> column mapping
    ########################################################################
    species_regex = re.compile(options.species_regex)
    # The gene regex is not used any further; it is compiled only so that
    # an invalid pattern is still rejected.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        """Return the first group of the species regex for *identifier*."""
        match = species_regex.search(identifier)
        if match is None:
            raise ValueError(
                "can not extract species from identifier %s" % identifier)
        return match.group(1)

    ## collect the species present in the input
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(member) for member in members)

    if reference_tree:
        ## prune tree to species present
        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]
    else:
        # sorted, so that the column order does not depend on hash order
        options.column2org = sorted(species_set)

    column2org = options.column2org
    options.org2column = dict(
        (org, column) for column, org in enumerate(column2org))
    org2column = options.org2column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    ########################################################################
    ## per-cluster counts
    ########################################################################
    notus = len(column2org)
    ncolumns = len(org2column)
    patterns = {}
    species_counts = [SpeciesCounts() for _ in column2org]

    ## first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(column2org), "\t".join(column2org)))

    for cluster in sorted(clusters):

        # per column: gene -> number of members, and number of members
        count_genes = [{} for _ in range(ncolumns)]
        count_transcripts = [0] * ncolumns

        for member in clusters[cluster]:
            fields = member.split(options.separator)

            if len(fields) == 4:
                species, transcript, gene = fields[:3]
            elif len(fields) == 2:
                species, gene = fields
                transcript = gene
            else:
                # previously such members silently reused the values of
                # the preceding member
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (member, options.separator))

            if species not in org2column:
                raise ValueError("unknown species %s" % species)

            col = org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_column = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_column)
        npresent_genes = sum(1 for ngenes in genes_per_column if ngenes > 0)

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join(
            (cluster, pattern, str(npresent_genes), str(ntotal_genes),
             "\t".join(map(str, genes_per_column)),
             str(ntotal_transcripts),
             "\t".join(map(str, count_transcripts)))) + "\n")

    ########################################################################
    ## write pattern summary
    ########################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        # legend: column index = organism
        for column, org in enumerate(column2org):
            outfile.write("# %i = %s\n" % (column, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    ########################################################################
    ## write summary counts per species
    ########################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        # rows in column order rather than in dictionary order
        for species, col in sorted(org2column.items(),
                                   key=lambda item: item[1]):
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species, len(counts.mTranscripts),
                           len(counts.mGenes), len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()