Example #1
File: stats.py Project: jeffhsu3/argweaver
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):

    from rasmus import gnuplot
    import scipy
    import scipy.stats

    # determine ndiv and binsize
    binsize = len(data) / ndivs
    if binsize < minsamples:
        ndivs = len(data) / minsamples
        binsize = len(data) / ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in xrange(0, len(data), binsize)]
    obs = scipy.array(map(len, bins))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)
    
    x = [bin[0] for bin in bins]
    expected = [len(data) * cdf(x[1], params)]
    expected.extend([len(data) *
                     (cdf(x[i+1], params) - cdf(x[i], params))
                     for i in range(1, len(x)-1)])
    expected.append(len(data) * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))
    
    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:        
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)
    
    return chi2, pval
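
All of the examples on this page revolve around util.mget from the rasmus util module. Judging only from the call sites above (integer indices into a list, or keys into a dict), a minimal stand-in for it, together with util.find and util.INF, would look roughly like the sketch below; these are assumptions inferred from usage, not the rasmus source.

# Stand-ins inferred from the call sites above (assumptions, not the rasmus library itself).
INF = float("inf")

def mget(container, keys):
    """Multi-get: return [container[k] for k in keys] (list indices or dict keys)."""
    return [container[k] for k in keys]

def find(pred, seq):
    """Return the indices of the items in seq for which pred(item) is true."""
    return [i for i, item in enumerate(seq) if pred(item)]

# For example, keep only the bins whose first element is non-negative:
bins = [[-2, -1], [0, 3], [4, 9]]
ind = find(lambda b: b[0] >= 0, bins)   # [1, 2]
kept = mget(bins, ind)                  # [[0, 3], [4, 9]]
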
Example #2
File: stats.py Project: ryneches/argweaver
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):

    from rasmus import gnuplot
    import scipy
    import scipy.stats

    # determine ndiv and binsize
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array(list(map(len, bins)))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    x = [bin[0] for bin in bins]
    expected = [len(data) * cdf(x[1], params)]
    expected.extend([len(data) *
                     (cdf(x[i+1], params) - cdf(x[i], params))
                     for i in range(1, len(x)-1)])
    expected.append(len(data) * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
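
The pattern shared by Examples #1 and #2 — sort the data, cut it into equal-count bins, derive each bin's expected count from the CDF (with open-ended first and last bins), and pass observed vs. expected to scipy.stats.chisquare — can be written without the rasmus helpers. The sketch below assumes a frozen scipy.stats distribution in place of the cdf(x, params) callable; it illustrates the same idea and is not the argweaver code.

import numpy as np
import scipy.stats

def chi_square_fit_simple(dist, data, ndivs=20):
    """Goodness of fit of a frozen scipy.stats distribution to data,
    using bins that each hold roughly the same number of observations."""
    data = np.sort(np.asarray(data, dtype=float))
    n = len(data)
    # interior bin edges taken from the sorted data, so bins are equal-count
    edges = data[np.arange(1, ndivs) * (n // ndivs)]
    # observed counts for (-inf, e1], (e1, e2], ..., (e_last, +inf)
    cum = np.searchsorted(data, edges, side="right")
    obs = np.diff(np.concatenate(([0], cum, [n])))
    # expected counts from the model CDF; open-ended end bins make
    # observed and expected both sum to n
    probs = np.diff(np.concatenate(([0.0], dist.cdf(edges), [1.0])))
    expected = n * probs
    return scipy.stats.chisquare(obs, expected)

# Data drawn from the fitted model should give a large p-value:
data = np.random.default_rng(0).normal(2.0, 1.5, size=2000)
chi2, pval = chi_square_fit_simple(scipy.stats.norm(2.0, 1.5), data)
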
Example #3
File: tablelib.py Project: sarab609/scraps
 def _write_directive(self, line, out, delim):
     """Write a directive"""
     
     if line == DIR_VERSION:
         out.write("##version:%s\n" % self.version)
     
     elif line == DIR_TYPES:
         if len(self) > 0:
             entry = self[0]
         else:
             entry = [""] * len(self.headers)
         out.write("##types:" +
                   self._type_lookup.formatTableTypes(
                         util.mget(self.types, self.headers),
                         delim) + "\n")
     elif line == DIR_DEFAULTS:
         out.write("##defaults:" +
                   delim.join(map(str, 
                             util.mget(self.defaults, self.headers))) + "\n")
     
     elif line == DIR_HEADERS:
         out.write("##headers:%d\n" % self.nheaders)
     
     else:
         raise "unknown directive:", line
Example #4
File: phylo.py Project: sarab609/scraps
 def walk(node):
     if node.isLeaf():
         return smap(node.name)
     else:
         child_hashes = map(walk, node.children)
         ind = util.sortrank(child_hashes)
         child_hashes = util.mget(child_hashes, ind)
         node.children = util.mget(node.children, ind)
         return hash_tree_compose(child_hashes)
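
Example #4 makes a tree hash order-insensitive by sorting the children's hashes before composing them (util.sortrank presumably returns the ranking indices, which mget then applies to both the hashes and the children). Below is a self-contained sketch of the same canonicalization on trees written as nested tuples; the helper is a stand-in, not phylo's hash_tree.

def hash_tree(tree, smap=lambda name: name):
    """Order-insensitive hash string for a tree given as a leaf name (str)
    or a tuple of subtrees -- a stand-in for the phylo helpers above."""
    if isinstance(tree, str):                       # leaf
        return str(smap(tree))
    child_hashes = sorted(hash_tree(c, smap) for c in tree)
    return "(" + ",".join(child_hashes) + ")"

# the same topology hashes identically regardless of child order
assert hash_tree((("a", "b"), "c")) == hash_tree(("c", ("b", "a")))
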
Example #5
    def init_distmats(self):
        """Initialize distance matrices
        
           Initialization should be done after trees and alignments
        """

        if len(self.distmats) > 0:
            self.matrices = []

            # setup matrices
            for i, distmat in enumerate(self.distmats):
                # convert distmatrix to summon Matrix
                if isinstance(distmat, matrix.Matrix):
                    mat = distmat
                else:
                    mat = matrix.Matrix()
                    mat.from2DList(distmat)

                # set default colormap
                if mat.colormap == None:
                    mat.colormap = self.matrix_colormap

                # determine labels
                if self.dist_labels_from_align and self.align_order != None:
                    # determine row/col labels from alignment if it exists
                    mat.rowlabels = self.align_order
                    mat.collabels = self.align_order

                elif self.distlabels != None:
                    mat.rowlabels = self.distlabels[i]
                    mat.collabels = self.distlabels[i]

                else:
                    raise Exception("no labels given for matrix")

                # reorder according to any given tree
                if self.order != None:
                    lookup = util.list2lookup(mat.rowlabels)
                    mat.rperm = util.mget(lookup, self.order)
                    mat.cperm = util.mget(lookup, self.order)

                mat.setup()

                self.matrices.append(mat)

            if self.seqs == None:
                seqs = self.current_align
            else:
                seqs = self.seqs

            # create matrix vis
            self.current_matrix = self.matrices[0]
            self.visdist = distmatrixvis.DistMatrixViewer(self.current_matrix,
                                                          seqs=seqs,
                                                          bgcolor=(1, 1, 1))
        else:
            self.visdist = None
Example #6
    def init_distmats(self):
        """Initialize distance matrices
        
           Initialization should be done after trees and alignments
        """
    
        if len(self.distmats) > 0:
            self.matrices = []
            
            # setup matrices            
            for i, distmat in enumerate(self.distmats):
                # convert distmatrix to summon Matrix
                if isinstance(distmat, matrix.Matrix):
                    mat = distmat
                else:
                    mat = matrix.Matrix()
                    mat.from2DList(distmat)            

                # set default colormap
                if mat.colormap == None:
                    mat.colormap = self.matrix_colormap
                
                # determine labels
                if self.dist_labels_from_align and self.align_order != None:
                    # determine row/col labels from alignment if it exists
                    mat.rowlabels = self.align_order
                    mat.collabels = self.align_order
                
                elif self.distlabels != None:
                    mat.rowlabels = self.distlabels[i]
                    mat.collabels = self.distlabels[i]
                    
                else:
                    raise Exception("no labels given for matrix")
                
                # reorder according to any given tree
                if self.order != None:
                    lookup = util.list2lookup(mat.rowlabels)
                    mat.rperm = util.mget(lookup, self.order)
                    mat.cperm = util.mget(lookup, self.order)
                
                mat.setup()

                self.matrices.append(mat)
        
            if self.seqs == None:
                seqs = self.current_align
            else:
                seqs = self.seqs
            
            # create matrix vis
            self.current_matrix = self.matrices[0]
            self.visdist = distmatrixvis.DistMatrixViewer(self.current_matrix, 
                                                          seqs=seqs, 
                                                          bgcolor=(1,1,1))
        else:
            self.visdist = None
Example #7
File: ml.py Project: mdrasmus/spimap
    def test_ml_large(self):
        """Test ML code"""

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
        align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        print l
        self.assert_(l != -util.INF)

        l = spidir.find_ml_branch_lengths_hky(tree,
                                              util.mget(
                                                  align, tree.leafNames()),
                                              bgfreq,
                                              kappa,
                                              parsinit=False,
                                              maxiter=1)
        print l
        self.assert_(l != -util.INF)
Example #8
File: ml.py Project: mdrasmus/spimap
    def _test_ml_speed(self):

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in xrange(10):
            l = spidir.find_ml_branch_lengths_hky(tree,
                                                  util.mget(
                                                      align, tree.leafNames()),
                                                  bgfreq,
                                                  kappa,
                                                  maxiter=10)
        util.toc()

        dists.append([n.dist for n in nodes])
        likes.append(l)
Example #9
File: stats.py Project: ryneches/argweaver
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):

    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array(list(map(len, bins)))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    def optfunc(params):
        x = [bin[0] for bin in bins]
        expected = [len(data) * cdf(x[1], params)]
        expected.extend([len(data) *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(len(data) * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
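
A call to the fit_distrib shown above would supply a cdf(x, params) callable and an initial parameter guess; scipy.optimize.fmin then searches for the parameters that minimize the chi-square statistic. The wrapper and starting values below are illustrative assumptions (and the call still needs rasmus.util importable), not part of argweaver.

import numpy as np
import scipy.stats

# Hypothetical cdf(x, params) wrapper: a normal distribution parameterized
# by (mean, sd), matching the callable signature fit_distrib expects.
def norm_cdf(x, params):
    mean, sd = params
    return scipy.stats.norm.cdf(x, loc=mean, scale=sd)

# data = np.random.default_rng(1).normal(5.0, 2.0, size=1000)
# params, pval = fit_distrib(norm_cdf, [1.0, 1.0], data)  # [1.0, 1.0] is the initial guess
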
Example #10
File: tablelib.py Project: sarab609/scraps
 def lookup(self, *keys, **options):
     """Returns a lookup dict based on a column 'key'
        or multiple keys
        
        extra options:
        default=None
        uselast=False    # allow multiple rows, just use last
     """
     
     options.setdefault("default", None)
     options.setdefault("uselast", False)
     lookup = util.Dict(dim=len(keys), default=options["default"])
     uselast = options["uselast"]
     
     for row in self:
         keys2 = util.mget(row, keys)
         ptr = lookup
         for i in xrange(len(keys2) - 1):
              ptr = ptr[keys2[i]]
         if not uselast and keys2[-1] in ptr:
             raise Exception("duplicate key '%s'" % str(keys2[-1]))
         ptr[keys2[-1]] = row
     
     lookup.insert = False
     return lookup
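
The lookup method in Examples #10 and #12 builds a nested dict, one level per key column, with the matching row stored at the leaves. Here is a small self-contained sketch of the same idea over plain dict rows; the column names are made up for illustration.

def lookup_rows(rows, *keys, uselast=False):
    """Nested dict keyed by the given columns, with the row at each leaf
    (a plain-dict sketch of Table.lookup above)."""
    table = {}
    for row in rows:
        ptr = table
        for key in keys[:-1]:
            ptr = ptr.setdefault(row[key], {})
        last = row[keys[-1]]
        if not uselast and last in ptr:
            raise Exception("duplicate key '%s'" % last)
        ptr[last] = row
    return table

rows = [{"chrom": "chr1", "name": "geneA", "start": 100},
        {"chrom": "chr1", "name": "geneB", "start": 900},
        {"chrom": "chr2", "name": "geneC", "start": 50}]
by_name = lookup_rows(rows, "name")            # by_name["geneB"]["start"] == 900
by_chrom = lookup_rows(rows, "chrom", "name")  # by_chrom["chr1"]["geneA"] is the first row
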
Example #11
File: ml.py Project: Watermelon876/spimap
    def _test_ml_speed(self):
        
        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in xrange(10):
            l = spidir.find_ml_branch_lengths_hky(
                tree,
                util.mget(align, tree.leafNames()),
                bgfreq, kappa,
                maxiter=10)            
        util.toc()

        dists.append([n.dist for n in nodes])
        likes.append(l)
Example #12
    def lookup(self, *keys, **options):
        """Returns a lookup dict based on a column 'key'
           or multiple keys
           
           extra options:
           default=None
           uselast=False    # allow multiple rows, just use last
        """

        options.setdefault("default", None)
        options.setdefault("uselast", False)
        lookup = util.Dict(dim=len(keys), default=options["default"])
        uselast = options["uselast"]

        for row in self:
            keys2 = util.mget(row, keys)
            ptr = lookup
            for i in xrange(len(keys2) - 1):
                ptr = ptr[keys2[i]]
            if not uselast and keys2[-1] in ptr:
                raise Exception("duplicate key '%s'" % str(keys2[-1]))
            ptr[keys2[-1]] = row

        lookup.insert = False
        return lookup
Example #13
File: phylo.py Project: sarab609/scraps
 def walk(node):
     node.recurse(walk)
     
     if not node.isLeaf():
         # this node's species is lca of children species  
         recon[node] = reconcile_lca(stree, order, 
                                    util.mget(recon, node.children))
Example #14
File: phylo.py Project: sarab609/scraps
def find_orthologs(gtree, stree, recon, counts=True):
    """Find all ortholog pairs within a gene tree"""

    events = label_events(gtree, recon)
    orths = []
    
    for node, event in events.items():
        if event == "spec":
            leavesmat = [x.leaves() for x in node.children]
            sp_counts = [util.hist_dict(util.mget(recon, row))
                         for row in leavesmat]
            
            for i in range(len(leavesmat)):
                for j in range(i+1, len(leavesmat)):
                    for gene1 in leavesmat[i]:
                        for gene2 in leavesmat[j]:
                            if gene1.name > gene2.name:
                                g1, g2 = gene2, gene1
                                a, b = j, i
                            else:
                                g1, g2 = gene1, gene2
                                a, b = i, j
                            
                            if not counts:
                                orths.append((g1.name, g2.name))
                            else:
                                orths.append((g1.name, g2.name,
                                              sp_counts[a][recon[g1]],
                                              sp_counts[b][recon[g2]]))
    
    return orths
Example #15
def read_length_matrix(filename, minlen=.0001, maxlen=1.0,
                       nooutliers=True):
    """Read a length matrix made by spidir-prep"""

    from rasmus import util

    dat = [line.rstrip().split("\t") for line in open(filename)]
    species = dat[0][2:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = map(int, util.cget(dat[1:], 1))
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = map(sum, lens)
        m = mean(treelens)
        ind = util.find(lambda x: x<5*m, treelens)
        files, gene_sizes, lens, treelens = [util.mget(x, ind) for x in
                                             files, gene_sizes, lens, treelens]



    for row in lens:
        for i in xrange(len(row)):
            if row[i] < minlen:
                row[i] = minlen

    
    return species, lens, gene_sizes, files
Example #16
File: ml.py Project: Watermelon876/spimap
    def test_ml_large(self):
        """Test ML code"""

        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/verts/19520/19520.ensembl.tree")
        align = fasta.readFasta("test/data/verts/19520/19520.nt.mfa")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        l = spidir.calc_seq_likelihood_hky(tree, align, bgfreq, kappa)
        print l
        self.assert_(l != -util.INF)


        l = spidir.find_ml_branch_lengths_hky(
            tree,
            util.mget(align, tree.leafNames()),
            bgfreq, kappa,
            parsinit=False,
            maxiter=1)
        print l
        self.assert_(l != -util.INF)
Example #17
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep"""

    from rasmus import util

    dat = [line.rstrip().split("\t") for line in open(filename)]
    species = dat[0][2:]
    lens = util.map2(
        float, util.submatrix(dat, range(1, len(dat)), range(2, len(dat[0]))))
    gene_sizes = map(int, util.cget(dat[1:], 1))
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = map(sum, lens)
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        files, gene_sizes, lens, treelens = [
            util.mget(x, ind) for x in files, gene_sizes, lens, treelens
        ]

    for row in lens:
        for i in xrange(len(row)):
            if row[i] < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
Example #18
File: stats.py Project: jeffhsu3/argweaver
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):

    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize
    binsize = len(data) / ndivs
    if binsize < minsamples:
        ndivs = len(data) / minsamples
        binsize = len(data) / ndivs

    data = sorted(data)
    bins = [data[i:i+binsize] for i in xrange(0, len(data), binsize)]
    obs = scipy.array(map(len, bins))
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)
    
    def optfunc(params):
        x = [bin[0] for bin in bins]
        expected = [len(data) * cdf(x[1], params)]
        expected.extend([len(data) *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(len(data) * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))
        
        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
Example #19
    def on_reorder_leaves(self):
        leaves = self.current_tree.leaf_names()

        # reorder matrix
        for mat in self.matrices:
            lookup = util.list2lookup(mat.rowlabels)
            mat.rperm = util.mget(lookup, leaves)
            mat.cperm = util.mget(lookup, leaves)
            mat.setup()
        if self.visdist:
            self.visdist.redraw()

        # reorder alignment
        for aln in self.aligns:
            aln.names = leaves
        if self.visalign:
            self.visalign.show()
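
Examples #5, #19, and #22 all reorder a matrix the same way: build a label-to-index lookup, then map the desired label order through it to get row/column permutations. The step in isolation, with list2lookup as a stand-in assumption for util.list2lookup:

def list2lookup(items):
    """Map each item to its index (a stand-in for util.list2lookup)."""
    return {item: i for i, item in enumerate(items)}

rowlabels = ["geneC", "geneA", "geneB"]      # current matrix order
order = ["geneA", "geneB", "geneC"]          # e.g. the tree's leaf order
lookup = list2lookup(rowlabels)
rperm = [lookup[name] for name in order]     # [1, 2, 0]
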
Example #20
File: ml.py Project: mdrasmus/spimap
    def _test_ml(self):
        """Test ML code"""

        # params
        bgfreq = [.258, .267, .266, .209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")

        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in range(40):
            l = spidir.find_ml_branch_lengths_hky(tree,
                                                  util.mget(
                                                      align, tree.leafNames()),
                                                  bgfreq,
                                                  kappa,
                                                  parsinit=False,
                                                  maxiter=1)

            dists.append([n.dist for n in nodes])
            likes.append(l)
        util.toc()

        print likes

        prep_dir("test/output/ml/")

        # distances plot
        util.rplot_start("test/output/ml/ml_branches.pdf")
        util.rplot("plot",
                   util.cget(dists, 0),
                   ylim=[0, max(dists[0])],
                   t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for d in zip(*dists):
            util.rplot("lines", d)
        util.rplot_end(True)

        print util.cget(dists, 4)

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot",
                   likes,
                   t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Example #21
File: __init__.py Project: sarab609/scraps
def parsimony_C(aln, tree):    
    ptree, nodes, nodelookup = makePtree(tree)
    leaves = [x.name for x in nodes if isinstance(x.name, str)]
    seqs = util.mget(aln, leaves)
    
    dists = pyspidir.parsimony(ptree, seqs)
    
    for i in xrange(len(dists)):
        nodes[i].dist = dists[i]
Example #22
 def on_reorder_leaves(self):
     leaves = self.current_tree.leaf_names()
     
     # reorder matrix
     for mat in self.matrices:
         lookup = util.list2lookup(mat.rowlabels)
         mat.rperm = util.mget(lookup, leaves)
         mat.cperm = util.mget(lookup, leaves)
         mat.setup()
     if self.visdist:
         self.visdist.redraw()
     
     
     # reorder alignment
     for aln in self.aligns:
         aln.names = leaves
     if self.visalign:
         self.visalign.show()
Example #23
File: alignlib.py Project: ongkong/compbio
def make_degen_str(aln):
    """Returns a string containing the degeneracy for each column 
       in an alignment
    """

    degens = find_degen(aln)
    degenmap = {-1: " ", 0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}

    return "".join(util.mget(degenmap, degens))
Example #24
File: stats.py Project: jeffhsu3/argweaver
 def optfunc(params):
     x = [bin[0] for bin in bins]
     expected = [len(data) * cdf(x[1], params)]
     expected.extend([len(data) *
                      (cdf(x[i+1], params) - cdf(x[i], params))
                      for i in range(1, len(x)-1)])
     expected.append(len(data) * (1.0 - cdf(x[-1], params)))
     expected = scipy.array(util.mget(expected, ind))
     
     chi2, pval = scipy.stats.chisquare(obs, expected)
     return chi2
Example #25
File: __init__.py Project: sarab609/scraps
def mlhkydist_C(aln, tree, bgfreq, ratio, maxiter):
    ptree, nodes, nodelookup = makePtree(tree)
    leaves = [x.name for x in nodes if isinstance(x.name, str)]
    seqs = util.mget(aln, leaves)
    
    dists, logl = pyspidir.mlhkydist(ptree, seqs, bgfreq, ratio, maxiter)
    
    for i in xrange(len(dists)):
        nodes[i].dist = dists[i]
    
    return logl
Example #26
File: ml.py Project: Watermelon876/spimap
    def _test_ml(self):
        """Test ML code"""

        # params
        bgfreq = [.258,.267,.266,.209]
        kappa = 1.59

        # data
        tree = treelib.readTree("test/data/flies.nt/0/0.tree")
        align = fasta.readFasta("test/data/flies.nt/0/0.align")


        likes = []
        dists = []

        nodes = sorted(tree.nodes.values(), key=lambda x: x.dist)

        util.tic("find ML")
        for i in range(40):
            l = spidir.find_ml_branch_lengths_hky(
                    tree,
                    util.mget(align, tree.leafNames()),
                    bgfreq, kappa,
                    parsinit=False,
                    maxiter=1)
            
            dists.append([n.dist for n in nodes])
            likes.append(l)
        util.toc()

        print likes

        prep_dir("test/output/ml/")

        # distances plot
        util.rplot_start("test/output/ml/ml_branches.pdf")
        util.rplot("plot", util.cget(dists, 0),
                   ylim=[0, max(dists[0])], t="l",
                   main="branch length convergence",
                   xlab="iterations",
                   ylab="branch lengths (sub/site)")
        for d in zip(* dists):
            util.rplot("lines", d)
        util.rplot_end(True)

        print util.cget(dists, 4)

        # likelihood plot
        util.rplot_start("test/output/ml/ml_likelihood.pdf")
        util.rplot("plot", likes, t="l",
                   xlab="iterations",
                   ylab="log likelihood",
                   main="likelihood convergence")
        util.rplot_end(True)
Example #27
File: stats.py Project: ryneches/argweaver
    def optfunc(params):
        x = [bin[0] for bin in bins]
        expected = [len(data) * cdf(x[1], params)]
        expected.extend([len(data) *
                         (cdf(x[i+1], params) - cdf(x[i], params))
                         for i in range(1, len(x)-1)])
        expected.append(len(data) * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2
Example #28
    def draw_matches(self, sp, chrom, start, end, drawn=None):
        vis = []

        if drawn is None:
            drawn = set()
        
        # build list of matches in order of drawing
        
        for gene in iter_chrom(self.db.get_regions(sp, chrom), start, end):
            # need to sort matches by genome order so that multi-genome synteny
            # is drawn top-down

            # get orthologs
            genes2 = [x for x in self.orth_lookup.get(gene.data["ID"], [])
                      if x in self.region_layout]
            if len(genes2) == 0:
                continue
            
            rows = util.groupby(lambda x: self.region_layout[x].y, genes2)
            keys = util.sort(rows.keys(), reverse=True)
            rows = util.mget(rows, keys)

            l = self.region_layout
            
            for i in range(1, len(rows)):
                for botGene in rows[i]:
                    gene1 = self.db.get_region(botGene)
                    for topGene in rows[i-1]:

                        if (botGene, topGene) in drawn:
                            continue

                        drawn.add((botGene, topGene))
                        
                        gene2 = self.db.get_region(topGene)
                        y1 = l[topGene].y 
                        y2 = l[botGene].y + 1
                        x1 = l[topGene].x
                        x2 = l[topGene].x + gene2.length()
                        x3 = l[botGene].x + gene1.length()
                        x4 = l[botGene].x
                        
                        if self.fat_matches:
                            vis.append(quads(
                                    self.colors["matches"],
                                    x1, y1,
                                    x2, y1,
                                    x3, y2,
                                    x4, y2))

                        vis.append(lines(self.colors["matches"],
                                         x1, y1,
                                         x4, y2))
        return group(* vis)
Example #29
File: stats.py Project: jeffhsu3/argweaver
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    sizes = [xbins[i+1] - xbins[i] for i in xrange(len(xbins)-1)]
    sizes.append(sizes[-1]) # NOTE: assumes bins are of equal size
    
    # only focus on bins that are large enough
    counts = [ybins[i] * sizes[i] * nsamples for i in xrange(len(xbins)-1)]
    
    expected = []
    for i in xrange(len(xbins)-1):
        expected.append((func(xbins[i]) + func(xbins[i+1]))/2.0 * 
                         sizes[i] * nsamples)
        
    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)
    
    if len(counts) == 0:
        return [0, 1], counts, expected
    else:
        return chiSquare([counts], [expected], nparams), counts, expected
Example #30
File: stats.py Project: mdrasmus/spimap
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    sizes = [xbins[i + 1] - xbins[i] for i in xrange(len(xbins) - 1)]
    sizes.append(sizes[-1])  # NOTE: assumes bins are of equal size

    # only focus on bins that are large enough
    counts = [ybins[i] * sizes[i] * nsamples for i in xrange(len(xbins) - 1)]

    expected = []
    for i in xrange(len(xbins) - 1):
        expected.append(
            (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples)

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    else:
        return chiSquare([counts], [expected], nparams), counts, expected
Example #31
    def draw_matches(self, sp, chrom, start, end, drawn=None):
        vis = []

        if drawn is None:
            drawn = set()

        # build list of matches in order of drawing

        for gene in iter_chrom(self.db.get_regions(sp, chrom), start, end):
            # need to sort matches by genome order so that multi-genome synteny
            # is drawn top-down

            # get orthologs
            genes2 = [
                x for x in self.orth_lookup.get(gene.data["ID"], [])
                if x in self.region_layout
            ]
            if len(genes2) == 0:
                continue

            rows = util.groupby(lambda x: self.region_layout[x].y, genes2)
            keys = util.sort(rows.keys(), reverse=True)
            rows = util.mget(rows, keys)

            l = self.region_layout

            for i in range(1, len(rows)):
                for botGene in rows[i]:
                    gene1 = self.db.get_region(botGene)
                    for topGene in rows[i - 1]:

                        if (botGene, topGene) in drawn:
                            continue

                        drawn.add((botGene, topGene))

                        gene2 = self.db.get_region(topGene)
                        y1 = l[topGene].y
                        y2 = l[botGene].y + 1
                        x1 = l[topGene].x
                        x2 = l[topGene].x + gene2.length()
                        x3 = l[botGene].x + gene1.length()
                        x4 = l[botGene].x

                        if self.fat_matches:
                            vis.append(
                                quads(self.colors["matches"], x1, y1, x2, y1,
                                      x3, y2, x4, y2))

                        vis.append(
                            lines(self.colors["matches"], x1, y1, x4, y2))
        return group(*vis)
Example #32
def query_point_regions(point, regions, inc=True):

    ind = util.sortindex(regions, key=lambda r: r[1])
    rind = util.mget(range(len(regions)), ind)
    regions_by_end = util.mget(regions, ind)

    end = util.binsearch([r[0] for r in regions], point)[1]
    start = util.binsearch([r[1] for r in regions_by_end], point)[0]

    if start is None:
        start = 0
    if end is None:
        end = len(regions)

    if inc:
        for i in xrange(start, end):
            if regions[i][0] <= point <= regions[i][1]:
                yield regions[i]
    else:
        for i in xrange(start, end):
            if regions[i][0] < point < regions[i][1]:
                yield regions[i]
Example #33
def query_point_regions(point, regions, inc=True):

    ind = util.sortindex(regions, key=lambda r: r[1])
    rind = util.mget(range(len(regions)), ind)
    regions_by_end = util.mget(regions, ind)

    end = util.binsearch([r[0] for r in regions], point)[1]
    start = util.binsearch([r[1] for r in regions_by_end], point)[0]

    if start is None:
        start = 0
    if end is None:
        end = len(regions)

    if inc:
        for i in xrange(start, end):
            if regions[i][0] <= point <= regions[i][1]:
                yield regions[i]
    else:
        for i in xrange(start, end):
            if regions[i][0] < point < regions[i][1]:
                yield regions[i]
Example #34
    def walk(node):
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
        else:
            # recurse
            for child in node.children:
                walk(child)

            childHashes = util.mget(colors, node.children)
            if len(childHashes) > 1 and util.equal(*childHashes):
                nmirrors[0] += 1

            childHashes.sort()
            colors[node] = phylo.hash_tree_compose(childHashes)
Example #35
def make_pep_colors(prop2color=prop2color):
    pep_colors = util.Dict(default=color(.5, .5, .5))

    AA = 'ARNDCEQGHILKMFPSTWYVU*'
    pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    prop_counts = util.Dict(default=0)
    for char in AA:
        prop = seqlib.AA_PROPERTY[char]
        tint = prop_counts[prop] / float(pep_per_prop[prop])
        pep_colors[char] = prop2color(prop, tint * .5)
        prop_counts[prop] += 1

    return pep_colors
Example #36
    def walk(node):
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
        else:
            # recurse
            for child in node.children:
                walk(child)

            childHashes = util.mget(colors, node.children)
            if len(childHashes) > 1 and util.equal(*childHashes):
                nmirrors[0] += 1

            childHashes.sort()
            colors[node] = phylo.hash_tree_compose(childHashes)
Example #37
def make_pep_colors(prop2color=prop2color):
    pep_colors = util.Dict(default=color(.5, .5, .5))

    AA = 'ARNDCEQGHILKMFPSTWYVU*'
    pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    prop_counts = util.Dict(default=0)
    for char in AA:
        prop = seqlib.AA_PROPERTY[char]
        tint = prop_counts[prop] / float(pep_per_prop[prop])
        pep_colors[char] = prop2color(prop, tint * .5)
        prop_counts[prop] += 1
    
    return pep_colors
Example #38
def make_degen_str(aln):
    """Returns a string containing the degeneracy for each column 
       in an alignment
    """

    degens = find_degen(aln)
    degenmap = {-1: " ",
                 0: "0",
                 1: "1",
                 2: "2",
                 3: "3",
                 4: "4"}
    
    return "".join(util.mget(degenmap, degens))
Example #39
    def _write_directive(self, line, out, delim):
        """Write a directive"""

        if line == DIR_VERSION:
            out.write("##version:%s\n" % self.version)

        elif line == DIR_TYPES:
            if len(self) > 0:
                entry = self[0]
            else:
                entry = [""] * len(self.headers)
            out.write("##types:" + self._type_lookup.formatTableTypes(
                util.mget(self.types, self.headers), delim) + "\n")
        elif line == DIR_DEFAULTS:
            out.write(
                "##defaults:" +
                delim.join(map(str, util.mget(self.defaults, self.headers))) +
                "\n")

        elif line == DIR_HEADERS:
            out.write("##headers:%d\n" % self.nheaders)

        else:
            raise "unknown directive:", line
Example #40
File: genecall.py Project: ongkong/compbio
def findFragments(regiondb, aln, overlapCutoff=.10):
    """Determine if alignment has gene fragments"""
    
    aln_genes = util.mget(regiondb.regions, aln.keys())
    nbrs = findNeighbors(regiondb, aln_genes)
    frags = []
    
    # are there any neighbors?
    if max(map(len, nbrs)) > 1:
        # do neighbors overlap in alignment?
        for nbr in nbrs:
            if len(nbr) > 1:
                aln2 = aln.get(x.data['ID'] for x in nbr)
                frags.extend(findMerges(aln2, overlapCutoff=overlapCutoff))
    return frags
Example #41
    def get_aligns(self,
                   species,
                   chrom,
                   start,
                   end,
                   mainspecies=lambda keys: keys[0],
                   collapse=False):
        """By default assumes main species is 1st sequence"""

        # get records for this region
        records = self.get(species, chrom, start, end)
        records.sort(key=lambda x: x["start"])

        # read alignments
        alns = []
        for record in records:
            aln = fasta.read_fasta(record["filename"])

            # collapse alignment
            if collapse:
                ind = util.findneq("-", aln[mainspecies(aln.keys())])

                for key, seq in aln.iteritems():
                    if len(seq) != 0:
                        aln[key] = "".join(util.mget(seq, ind))

            l2a = alignlib.local2align(aln[mainspecies(aln.keys())])

            # trim front
            if start > record["start"]:
                trimstart = l2a[start - record["start"]]
            else:
                trimstart = 0

            # trim end
            if end < record["end"]:
                trimend = l2a[-(record["end"] - end)]
            else:
                trimend = aln.alignlen()

            # perform trim
            for key, seq in aln.iteritems():
                aln[key] = seq[trimstart:trimend]

            alns.append(aln)

        return alns
Example #42
File: __init__.py Project: sarab609/scraps
def learnModel(trees, stree, gene2species, statsprefix="", filenames=None):
    util.tic("learn model")

    util.tic("find branch length distributions")
    lengths, used = phylo.find_branch_distrib(trees, stree, gene2species, False)
    debug("Total trees matching species topology: %d out of %d" % 
          (sum(used), len(trees)))
    util.toc()
    
    params = {}
    
    totlens = map(sum, zip(* lengths.values()))
    
    # print output stats
    if statsprefix != "":
        writeTreeDistrib(file(statsprefix + ".lens", "w"), lengths)
        rates = treeDistrib2table(lengths, filenames=filenames)
        rates.write(statsprefix + "_rates.tab")
    
    
    util.tic("fitting params")
    for node, lens in lengths.items():
        if len(lens) == 0 or max(lens) == min(lens):
            continue
        
        util.tic("fitting params for " + str(node.name))
        param = fitNormal2(util.vdiv(lens, totlens))
        
        params[node.name] = param
        util.toc()
    util.toc()
    
    # calc distribution of total tree length
    trees2 = util.mget(trees, util.findeq(True, used))
    lens = map(lambda x: sum(y.dist for y in x.nodes.values()), trees2)
    lens = filter(lambda x: x < 20, lens)
    mu = stats.mean(lens)
    lens = filter(lambda x: x < 2*mu, lens)
    mu = stats.mean(lens)
    sigma2 = stats.variance(lens)
    params["baserate"] = [mu*mu/sigma2, mu/sigma2]
    params[stree.root.name] = [0, 1]
    
    util.toc()
    
    return params
Example #43
def find_xenologs(gtree,
                  stree,
                  recon,
                  events,
                  trans,
                  counts=True,
                  species_branch=False):
    """Find all xenolog pairs within a gene tree

    NOTE: THIS HAS NOT BEEN TESTED!!!
    """
    xenos = []

    for node, event in events.items():
        if event == "trans":
            assert len(node.children) == 2
            if trans[node] == node.children[0]:
                children = (node.children[1], node.children[0])
            else:
                children = node.children
            leavesmat = [x.leaves() for x in children]
            sp_counts = [
                util.hist_dict(util.mget(recon, row)) for row in leavesmat
            ]

            for i in range(len(leavesmat)):
                for j in range(i + 1, len(leavesmat)):
                    for gene1 in leavesmat[i]:
                        for gene2 in leavesmat[j]:
                            g1, g2 = gene1, gene2
                            a, b = i, j

                            xeno = [g1.name, g2.name]
                            if counts:
                                xeno.extend([
                                    sp_counts[a][recon[g1]],
                                    sp_counts[b][recon[g2]]
                                ])
                            if species_branch:
                                xeno.append(recon[node])
                            xenos.append(tuple(xeno))

    return xenos
Example #44
    def get_aligns(self, species, chrom, start, end,
                   mainspecies=lambda keys: keys[0],
                   collapse=False):
        """By default assumes main species is 1st sequence"""

        # get records for this region
        records = self.get(species, chrom, start, end)
        records.sort(key=lambda x: x["start"])

        # read alignments
        alns = []
        for record in records:
            aln = fasta.read_fasta(record["filename"])

            # collapse alignment
            if collapse:
                ind = util.findneq("-", aln[mainspecies(aln.keys())])

                for key, seq in aln.iteritems():
                    if len(seq) != 0:
                        aln[key] = "".join(util.mget(seq, ind))

            l2a = alignlib.local2align(aln[mainspecies(aln.keys())])

            # trim front
            if start > record["start"]:
                trimstart = l2a[start - record["start"]]
            else:
                trimstart = 0

            # trim end
            if end < record["end"]:
                trimend = l2a[-(record["end"]-end)]
            else:
                trimend = aln.alignlen()

            # perform trim
            for key, seq in aln.iteritems():
                aln[key] = seq[trimstart:trimend]

            alns.append(aln)

        return alns
Example #45
    def write(self, out, fullpage=False):
        """Write HTML table"""
        out = util.open_stream(out, "w")

        if fullpage:
            out.write("<html>")

        if self.title:
            out.write("<head><title>%s</title></head>\n" % self.title)

        out.write(
            "<style>.tab { border-right: 1px solid #777; border-bottom: 1px solid #777;}</style>"
        )

        if self.title is not None:
            out.write("<h1>%s</h1>" % self.title)

        # write headers
        out.write("<table cellspacing=0 style='border: 1px solid black;'>\n")
        out.write("<tr><td class='tab'><b>#</b></td>")
        for header in self.headers:
            out.write("<td class='tab'><b>%s</b></td>" % header)
        out.write("</tr>\n")

        # write rows
        for i, row in enumerate(self.table):
            out.write("<tr><td class='tab'>%d.</td>" % (i + 1))
            for j, item in enumerate(util.mget(row, self.table.headers)):

                if self.formats[j] is not None:
                    # write formatting
                    out.write("<td class='tab'>%s&nbsp;</td>" %
                              self.formats[j](item))
                else:
                    out.write("<td class='tab'><nobr>%s&nbsp;</nobr></td>" %
                              str(item))
            out.write("</tr>\n")

        out.write("</table>")

        if fullpage:
            out.write("</html>")
Example #46
    def write(self, out, fullpage=False):
        """Write HTML table"""
        out = util.open_stream(out, "w")

        if fullpage:
            out.write("<html>")

        if self.title:
            out.write("<head><title>%s</title></head>\n" % self.title)

        out.write("<style>.tab { border-right: 1px solid #777; border-bottom: 1px solid #777;}</style>")  # nopep8

        if self.title is not None:
            out.write("<h1>%s</h1>" % self.title)

        # write headers
        out.write("<table cellspacing=0 style='border: 1px solid black;'>\n")
        out.write("<tr><td class='tab'><b>#</b></td>")
        for header in self.headers:
            out.write("<td class='tab'><b>%s</b></td>" % header)
        out.write("</tr>\n")

        # write rows
        for i, row in enumerate(self.table):
            out.write("<tr><td class='tab'>%d.</td>" % (i+1))
            for j, item in enumerate(util.mget(row, self.table.headers)):

                if self.formats[j] is not None:
                    # write formatting
                    out.write("<td class='tab'>%s&nbsp;</td>" %
                              self.formats[j](item))
                else:
                    out.write(
                        "<td class='tab'><nobr>%s&nbsp;</nobr></td>" %
                        str(item))
            out.write("</tr>\n")

        out.write("</table>")

        if fullpage:
            out.write("</html>")
Example #47
    def get_matrix(self, rowheader="rlabels"):
        """Returns mat, rlabels, clabels

           where mat is a copy of the table as a 2D list
                 rlabels are the row labels
                 clabels are the column labels
        """
        # get labels
        if rowheader is not None and rowheader in self.headers:
            rlabels = self.cget(rowheader)
            clabels = copy.copy(self.headers)
            clabels.remove(rowheader)
        else:
            rlabels = range(len(self))
            clabels = copy.copy(self.headers)

        # get data
        mat = []
        for row in self:
            mat.append(util.mget(row, clabels))

        return mat, rlabels, clabels
Example #48
File: phylo.py Project: sarab609/scraps
def getRelBranchLens(rates, species=None):
    if species == None:
        species = rates.headers
    
    nonspecies = set(rates.headers) - set(species)
    
    relrates = rates.new()
    
    for row in rates:
        row2 = {}
        tot = sum(util.mget(row, species))
        
        for sp in species:
            row2[sp] = row[sp] / tot
        
        # copy over non-species data
        for key in nonspecies:
            row2[key] = row[key]
        
        relrates.append(row2)
    
    return relrates
Example #49
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies"""

    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)
    colors = {}
    nmirrors = [0]

    def walk(node):
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
        else:
            # recurse
            for child in node.children:
                walk(child)

            childHashes = util.mget(colors, node.children)
            if len(childHashes) > 1 and util.equal(*childHashes):
                nmirrors[0] += 1

            childHashes.sort()
            colors[node] = phylo.hash_tree_compose(childHashes)

    walk(node)

    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    if all_leaves:
        val = stats.factorial(len(leaves))
    else:
        val = 1
        for s in colorsizes:
            if s > 1:
                val *= stats.factorial(s)
    #print "py val=", val, "nmirrors=", nmirrors[0]
    return val / (2**nmirrors[0])
Example #50
def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies"""

    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)
    colors = {}
    nmirrors = [0]

    def walk(node):
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
        else:
            # recurse
            for child in node.children:
                walk(child)

            childHashes = util.mget(colors, node.children)
            if len(childHashes) > 1 and util.equal(*childHashes):
                nmirrors[0] += 1

            childHashes.sort()
            colors[node] = phylo.hash_tree_compose(childHashes)

    walk(node)

    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    if all_leaves:
        val = stats.factorial(len(leaves))
    else:
        val = 1
        for s in colorsizes:
            if s > 1:
                val *= stats.factorial(s)
    # print "py val=", val, "nmirrors=", nmirrors[0]
    return val / (2 ** nmirrors[0])
Example #51
def subalign(aln, cols):
    """Returns an alignment with a subset of the columns (cols)"""

    return mapalign(aln, valfunc=lambda x: "".join(util.mget(x, cols)))
Example #52
File: alignlib.py Project: ongkong/compbio
 def func(seq):
     dct = {-1: "-", 0: "0", 1: "1", 2: "2"}
     return "".join(util.mget(dct, mark_codon_pos(seq)))
Example #53
File: alignlib.py Project: ongkong/compbio
def subalign(aln, cols):
    """Returns an alignment with a subset of the columns (cols)"""

    return mapalign(aln, valfunc=lambda x: "".join(util.mget(x, cols)))