Exemplo n.º 1
0
def make_degen_str(aln):
    """Return the degeneracy of each alignment column as one string.

    The per-column degeneracy values are obtained from find_degen(aln) and
    each value is translated to a single character: 0 through 4 become the
    matching digit and -1 becomes a space.
    """
    symbols = {-1: " ", 0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}
    column_degens = find_degen(aln)

    # util.mget is presumed to look up every degeneracy value in `symbols`
    chars = util.mget(symbols, column_degens)
    return "".join(chars)
Exemplo n.º 2
0
def chi_square_fit(cdf,
                   params,
                   data,
                   ndivs=20,
                   minsamples=5,
                   plot=False,
                   start=-util.INF,
                   end=util.INF):
    """Chi-square goodness-of-fit test of binned data against a CDF.

    The data are sorted and cut into consecutive bins holding `binsize`
    samples each (the last bin may be shorter).  For every bin the expected
    count under `cdf` is compared with the observed count.

    cdf        -- callable invoked as cdf(x, params)
    params     -- parameters handed through to cdf unchanged
    data       -- sequence of samples (a sorted copy is used)
    ndivs      -- requested number of bins; lowered to
                  len(data) // minsamples when the requested bins would hold
                  fewer than minsamples samples
    minsamples -- minimum number of samples wanted per bin
    plot       -- if true, plot observed and expected counts with
                  rasmus.gnuplot
    start, end -- only bins whose last value is >= start and whose first
                  value is <= end take part in the test

    Returns the (chi2, pval) pair produced by scipy.stats.chisquare.

    NOTE: at least two bins are needed (the first expected count reads the
    second bin edge), and len(data) < minsamples leads to a division by zero
    when the number of bins is recomputed.
    """
    import scipy
    import scipy.stats

    # determine ndiv and binsize; floor division keeps binsize an integer
    # (usable as a slice step) regardless of the active division semantics
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(b) for b in bins])

    # util.find is presumed to return the indices of the bins that overlap
    # the [start, end] window -- confirm against util
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # x holds the smallest sample of each bin, used as the bin's lower edge
    x = [b[0] for b in bins]
    total = len(data)

    # first bin: everything below the second edge
    expected = [total * cdf(x[1], params)]
    # interior bins: probability mass between consecutive edges
    expected.extend([
        total * (cdf(x[i + 1], params) - cdf(x[i], params))
        for i in range(1, len(x) - 1)
    ])
    # last bin: everything above the last edge
    expected.append(total * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so gnuplot is only required when plotting
        from rasmus import gnuplot

        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
Exemplo n.º 3
0
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned values against a function.

    xbins      -- bin edges; bin i spans xbins[i] .. xbins[i + 1]
    ybins      -- per-bin values; the count of bin i is taken to be
                  ybins[i] * width_i * nsamples
    func       -- callable func(x); the expected count of a bin is the
                  average of func at its two edges times width times nsamples
    nsamples   -- total number of samples used to scale counts
    nparams    -- passed through to chiSquare
    minsamples -- bins are kept only when their expected count passes
                  util.gefunc(minsamples) (presumably ">= minsamples")

    Returns (result, counts, expected) where counts and expected are
    restricted to the retained bins.  result is whatever chiSquare returns,
    or [0, 1] when no bin is retained (this includes xbins with fewer than
    two edges, which previously raised IndexError).
    """
    nbins = len(xbins) - 1
    sizes = [xbins[i + 1] - xbins[i] for i in range(nbins)]

    # observed counts reconstructed from the per-bin values
    counts = [ybins[i] * sizes[i] * nsamples for i in range(nbins)]

    # expected counts: mean of func over the two bin edges, scaled by width
    expected = [
        (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples
        for i in range(nbins)
    ]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    return chiSquare([counts], [expected], nparams), counts, expected
Exemplo n.º 4
0
    def optfunc(params):
        """Return the chi-square statistic of the binned data for `params`.

        Relies on `bins`, `data`, `cdf`, `ind` and `obs` from the enclosing
        scope (not visible in this fragment).
        """
        total = len(data)
        # the first sample of each bin acts as that bin's lower edge
        edges = [samples[0] for samples in bins]

        # first bin: everything below the second edge
        expected = [total * cdf(edges[1], params)]
        # interior bins: mass between consecutive edges
        for i in range(1, len(edges) - 1):
            mass = cdf(edges[i + 1], params) - cdf(edges[i], params)
            expected.append(total * mass)
        # last bin: everything above the last edge
        expected.append(total * (1.0 - cdf(edges[-1], params)))

        selected = scipy.array(util.mget(expected, ind))
        statistic, _pval = scipy.stats.chisquare(obs, selected)
        return statistic
Exemplo n.º 5
0
def fit_distrib(cdf,
                params_init,
                data,
                ndivs=20,
                minsamples=5,
                start=-util.INF,
                end=util.INF):
    """Fit distribution parameters by minimising a binned chi-square.

    The data are sorted and cut into consecutive bins of `binsize` samples;
    scipy.optimize.fmin then searches for the parameters that minimise the
    chi-square statistic between observed and expected bin counts.

    cdf         -- callable invoked as cdf(x, params)
    params_init -- starting point for the optimiser
    data        -- sequence of samples (a sorted copy is used)
    ndivs       -- requested number of bins; lowered to
                   len(data) // minsamples when the requested bins would
                   hold fewer than minsamples samples
    minsamples  -- minimum number of samples wanted per bin
    start, end  -- only bins whose last value is >= start and whose first
                   value is <= end take part in the fit

    Returns (params, pval): the fitted parameters as a list and the p-value
    reported by chi_square_fit for those parameters, evaluated over the same
    start/end window that was used for the fit.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize; floor division keeps binsize an integer
    # (usable as a slice step) regardless of the active division semantics
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(b) for b in bins])

    # util.find is presumed to return the indices of the bins that overlap
    # the [start, end] window -- confirm against util
    ind = util.find(lambda x: x[-1] >= start and x[0] <= end, bins)
    obs = util.mget(obs, ind)

    # bin edges and sample count do not depend on the parameters, so they
    # are computed once instead of on every objective evaluation
    x = [b[0] for b in bins]
    total = len(data)

    def optfunc(params):
        # objective: chi-square statistic of observed vs. expected counts
        expected = [total * cdf(x[1], params)]
        expected.extend([
            total * (cdf(x[i + 1], params) - cdf(x[i], params))
            for i in range(1, len(x) - 1)
        ])
        expected.append(total * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, _pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)

    # forward start/end so the p-value covers the same bins as the fit
    _chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples,
                                 start=start, end=end)

    return list(params), pval
Exemplo n.º 6
0
    def get_matrix(self, rowheader="rlabels"):
        """Return the table as (mat, rlabels, clabels).

        mat     -- one entry per row, holding that row's values for the
                   columns listed in clabels
        rlabels -- the values of the `rowheader` column when it is one of
                   the table's headers, otherwise the row indices
        clabels -- a copy of the headers, without `rowheader` when that
                   column supplied the row labels
        """
        clabels = copy.copy(self.headers)

        if rowheader is None or rowheader not in self.headers:
            # no label column available: number the rows instead
            rlabels = range(len(self))
        else:
            rlabels = self.cget(rowheader)
            clabels.remove(rowheader)

        mat = [util.mget(row, clabels) for row in self]
        return mat, rlabels, clabels
Exemplo n.º 7
0
    def lookup(self, *keys, **options):
        """Returns a lookup dict based on a column 'key'
        or multiple keys

        With several keys the result is nested one level per key, so a row
        is reached as result[k1][k2]...[kn].

        extra options:
        default=None
        uselast=False    # allow multiple rows, just use last

        Raises Exception when two rows share the same keys and uselast is
        false.
        """
        default = options.setdefault("default", None)
        uselast = options.setdefault("uselast", False)
        lookup = util.Dict(dim=len(keys), default=default)

        for row in self:
            keys2 = util.mget(row, keys)

            # descend one nesting level per leading key; each step must
            # continue from the level reached so far, not from the root
            ptr = lookup
            for i in range(len(keys2) - 1):
                ptr = ptr[keys2[i]]

            if not uselast and keys2[-1] in ptr:
                raise Exception("duplicate key '%s'" % str(keys2[-1]))
            ptr[keys2[-1]] = row

        # presumably stops util.Dict from creating entries on later reads
        # -- confirm against util.Dict
        lookup.insert = False
        return lookup
Exemplo n.º 8
0
def subalign(aln, cols):
    """Return an alignment restricted to the columns listed in cols."""

    def pick_columns(seq):
        # keep only the requested positions and glue them back together
        return "".join(util.mget(seq, cols))

    return mapalign(aln, valfunc=pick_columns)
Exemplo n.º 9
0
 def func(seq):
     """Return the codon-position marks of seq as a string.

     The marks come from mark_codon_pos(seq); 0, 1 and 2 become the matching
     digit and -1 becomes "-".
     """
     symbols = {-1: "-", 0: "0", 1: "1", 2: "2"}
     positions = mark_codon_pos(seq)
     return "".join(util.mget(symbols, positions))