def make_degen_str(aln):
    """Build a one-character-per-column degeneracy string for an alignment.

    Every value reported by find_degen(aln) is translated to one character:
    -1 becomes a space and 0 through 4 become the matching digit.
    """
    symbols = {-1: " ", 0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}
    column_degens = find_degen(aln)
    chars = util.mget(symbols, column_degens)
    return "".join(chars)
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit test of `data` against a distribution.

    The sorted data is cut into consecutive bins holding `binsize` samples
    each (the last bin may be shorter).  The observed number of samples per
    bin is compared with the number expected from `cdf`, evaluated at the
    first sample of each bin.

    Args:
        cdf: callable invoked as `cdf(x, params)`; used as the cumulative
            distribution function.
        params: passed through unchanged as the second argument of `cdf`.
        data: sequence of samples (must support `len` and `sorted`).
        ndivs: requested number of bins.
        minsamples: minimum number of samples per bin; when the requested
            `ndivs` would give smaller bins, the bin count is reduced.
        plot: when true, plot observed and expected counts with
            `rasmus.gnuplot` (imported only in that case).
        start, end: bounds used to select bins; a bin is tested when its
            last sample is >= `start` and its first sample is <= `end`.

    Returns:
        The `(chi2, pval)` pair produced by `scipy.stats.chisquare`.

    Raises:
        ZeroDivisionError: if `data` holds fewer than `minsamples` samples,
            because the reduced bin count becomes zero.
    """
    import scipy
    import scipy.stats

    total = len(data)

    # determine ndiv and binsize; floor division keeps binsize an integer so
    # it stays usable as a slice width and range step under true division
    binsize = total // ndivs
    if binsize < minsamples:
        ndivs = total // minsamples
        binsize = total // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, total, binsize)]
    obs = scipy.array([len(b) for b in bins])
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # first sample of every bin, used as the bin's lower boundary
    x = [b[0] for b in bins]

    # first bin: everything below the start of the second bin
    expected = [total * cdf(x[1], params)]
    # interior bins: mass between consecutive bin boundaries
    expected.extend(
        total * (cdf(upper, params) - cdf(lower, params))
        for lower, upper in zip(x[1:-1], x[2:])
    )
    # last bin: everything above the start of the last bin
    expected.append(total * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        # imported lazily so the plotting package is only needed on demand
        from rasmus import gnuplot
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned values against a function.

    For every interval between consecutive `xbins` entries, the observed
    count is `ybins[i] * width * nsamples` and the expected count is the
    average of `func` at the two interval ends times `width * nsamples`.
    Only intervals selected by `util.find(util.gefunc(minsamples), expected)`
    are tested (presumably those whose expected count is at least
    `minsamples` -- confirm against util.gefunc).

    Args:
        xbins: sequence of bin boundaries.
        ybins: sequence indexed like `xbins`; entry i is scaled by the width
            of interval i and by `nsamples`.
        func: callable of one argument evaluated at each boundary.
        nsamples: total number of samples used for scaling.
        nparams: forwarded to `chiSquare`.
        minsamples: threshold handed to `util.gefunc`.

    Returns:
        A tuple `(result, counts, expected)`.  `result` is `[0, 1]` when no
        interval is selected, otherwise the value returned by
        `chiSquare([counts], [expected], nparams)`.
    """
    nintervals = len(xbins) - 1

    # width of each interval; only indices 0 .. nintervals-1 are ever read,
    # so no padding element is appended (the old padding raised IndexError
    # when fewer than two boundaries were given)
    sizes = [xbins[i + 1] - xbins[i] for i in range(nintervals)]

    counts = [ybins[i] * sizes[i] * nsamples for i in range(nintervals)]
    expected = [
        (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples
        for i in range(nintervals)
    ]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    return chiSquare([counts], [expected], nparams), counts, expected
def optfunc(params):
    """Return the chi-square statistic of the binned data for `params`.

    The names `bins`, `data`, `cdf`, `ind`, `obs`, `scipy` and `util` are
    free variables here and must be supplied by the surrounding scope.
    """
    total = len(data)
    # first sample of every bin serves as that bin's lower boundary
    edges = [samples[0] for samples in bins]

    # first bin: everything below the start of the second bin
    expected = [total * cdf(edges[1], params)]
    # interior bins: mass between consecutive boundaries
    for i in range(1, len(edges) - 1):
        upper = cdf(edges[i + 1], params)
        lower = cdf(edges[i], params)
        expected.append(total * (upper - lower))
    # last bin: everything above the start of the last bin
    expected.append(total * (1.0 - cdf(edges[-1], params)))

    selected = scipy.array(util.mget(expected, ind))
    statistic, _ = scipy.stats.chisquare(obs, selected)
    return statistic
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of `cdf` to `data` by minimising chi-square.

    The sorted data is cut into consecutive bins of `binsize` samples (the
    last bin may be shorter).  `scipy.optimize.fmin` then searches for the
    parameters that minimise the chi-square statistic between the observed
    bin counts and the counts expected from `cdf`.

    Args:
        cdf: callable invoked as `cdf(x, params)`; used as the cumulative
            distribution function.
        params_init: starting point handed to `scipy.optimize.fmin`.
        data: sequence of samples (must support `len` and `sorted`).
        ndivs: requested number of bins.
        minsamples: minimum number of samples per bin; when the requested
            `ndivs` would give smaller bins, the bin count is reduced.
        start, end: bounds used to select bins for the fit; a bin is used
            when its last sample is >= `start` and its first sample is
            <= `end`.

    Returns:
        A tuple `(params, pval)`: the fitted parameters as a list and the
        p-value reported by `chi_square_fit` for those parameters.

    Raises:
        ZeroDivisionError: if `data` holds fewer than `minsamples` samples,
            because the reduced bin count becomes zero.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    total = len(data)

    # determine ndiv and binsize; floor division keeps binsize an integer so
    # it stays usable as a slice width and range step under true division
    binsize = total // ndivs
    if binsize < minsamples:
        ndivs = total // minsamples
        binsize = total // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, total, binsize)]
    obs = scipy.array([len(b) for b in bins])
    ind = util.find(lambda b: b[-1] >= start and b[0] <= end, bins)
    obs = util.mget(obs, ind)

    # bin lower boundaries do not depend on the parameters, so compute them
    # once instead of on every optimiser step
    x = [b[0] for b in bins]

    def optfunc(params):
        # first bin: everything below the start of the second bin
        expected = [total * cdf(x[1], params)]
        # interior bins: mass between consecutive bin boundaries
        expected.extend(
            total * (cdf(upper, params) - cdf(lower, params))
            for lower, upper in zip(x[1:-1], x[2:])
        )
        # last bin: everything above the start of the last bin
        expected.append(total * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, _ = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)

    # NOTE(review): start/end are not forwarded here, so the reported p-value
    # covers all bins while the fit above used only the selected ones.
    # Left unchanged; confirm whether this is intended.
    _, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
def get_matrix(self, rowheader="rlabels"):
    """Split the table into a data matrix plus row and column labels.

    Returns a tuple (mat, rlabels, clabels):
        mat     -- one list per row, holding that row's values for clabels
        rlabels -- the `rowheader` column when it is one of the headers,
                   otherwise range(len(self))
        clabels -- a copy of the headers, without `rowheader` when that
                   column was used for the row labels
    """
    clabels = copy.copy(self.headers)

    # use the named column as row labels only when it actually exists
    has_label_column = rowheader is not None and rowheader in self.headers
    if has_label_column:
        rlabels = self.cget(rowheader)
        clabels.remove(rowheader)
    else:
        rlabels = range(len(self))

    mat = [util.mget(row, clabels) for row in self]
    return mat, rlabels, clabels
def lookup(self, *keys, **options):
    """Returns a lookup dict based on a column 'key' or multiple keys

    With several keys the result is nested one level per key: the value of
    the first key selects a sub-dict, and so on, until the value of the
    last key maps to the row itself.

    extra options:
    default=None    # default passed to util.Dict
    uselast=False   # allow multiple rows, just use last

    Raises Exception when two rows share the same key path and uselast is
    false.
    """
    default = options.get("default", None)
    uselast = options.get("uselast", False)

    table = util.Dict(dim=len(keys), default=default)

    for row in self:
        keys2 = util.mget(row, keys)

        # Descend one nesting level per leading key.  Each step must start
        # from the level reached so far; indexing the top-level dict every
        # time stores rows under the wrong path once there are 3+ keys.
        ptr = table
        for key in keys2[:-1]:
            ptr = ptr[key]

        last = keys2[-1]
        if not uselast and last in ptr:
            raise Exception("duplicate key '%s'" % str(last))
        ptr[last] = row

    # presumably stops util.Dict from creating entries on later reads --
    # confirm against util.Dict
    table.insert = False
    return table
def subalign(aln, cols):
    """Return the alignment produced by mapalign, keeping only columns `cols`.

    Each value handed over by mapalign is rebuilt as a string from the
    entries that util.mget selects with `cols`.
    """
    def pick_columns(seq):
        return "".join(util.mget(seq, cols))

    return mapalign(aln, valfunc=pick_columns)
def func(seq):
    """Render the output of mark_codon_pos(seq) as a string.

    Each mark is translated to one character: -1 becomes "-" and 0, 1, 2
    become the matching digit.
    """
    symbols = {-1: "-", 0: "0", 1: "1", 2: "2"}
    marks = mark_codon_pos(seq)
    return "".join(util.mget(symbols, marks))