def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit test of `data` against a parametric CDF.

    The sorted data is cut into consecutive bins of `binsize` samples each
    (the last bin may be shorter).  Expected counts per bin are derived
    from `cdf` evaluated at the first sample of each bin, and compared to
    the observed counts with `scipy.stats.chisquare`.

    Arguments:
    cdf        -- callable `cdf(x, params)` returning the cumulative
                  probability at x
    params     -- parameters passed through to `cdf`
    data       -- sequence of samples
    ndivs      -- requested number of bins
    minsamples -- if the requested binning gives fewer samples per bin
                  than this, the number of bins is reduced accordingly
    plot       -- if true, plot observed and expected counts with gnuplot
    start, end -- only bins overlapping [start, end] are tested

    Returns the tuple (chi2, pval) produced by `scipy.stats.chisquare`.

    At least two bins are required (the first expected count reads the
    second bin edge).  Raises ZeroDivisionError when rebinning is needed
    and len(data) < minsamples.
    """
    from rasmus import gnuplot
    import scipy
    import scipy.stats

    # determine ndiv and binsize
    # Floor division keeps binsize an int under both Python 2 and 3; it is
    # used below as a slice width and range step.
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(chunk) for chunk in bins])

    # indices of the bins that overlap the requested [start, end] window
    ind = util.find(lambda chunk: chunk[-1] >= start and chunk[0] <= end,
                    bins)
    obs = util.mget(obs, ind)

    # first (smallest) sample of every bin, used as the bin's lower edge
    x = [chunk[0] for chunk in bins]
    total = len(data)

    # first bin: everything below the second edge; last bin: everything
    # above the last edge; interior bins: CDF difference between edges
    expected = [total * cdf(x[1], params)]
    expected.extend(total * (cdf(x[i + 1], params) - cdf(x[i], params))
                    for i in range(1, len(x) - 1))
    expected.append(total * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of a CDF to `data` by minimizing chi-square.

    The sorted data is cut into consecutive bins of `binsize` samples each
    (the last bin may be shorter).  `scipy.optimize.fmin` then searches
    for the parameters minimizing the chi-square statistic between the
    observed bin counts and the counts expected under `cdf`, using only
    the bins that overlap [start, end].

    Arguments:
    cdf         -- callable `cdf(x, params)` returning the cumulative
                   probability at x
    params_init -- starting point for the optimizer
    data        -- sequence of samples
    ndivs       -- requested number of bins
    minsamples  -- if the requested binning gives fewer samples per bin
                   than this, the number of bins is reduced accordingly
    start, end  -- only bins overlapping [start, end] drive the fit

    Returns (params, pval): the fitted parameters as a list and the
    p-value reported by chi_square_fit() for those parameters.

    NOTE(review): the final chi_square_fit() call does not receive
    start/end, so the returned p-value is computed over all bins --
    confirm this is intended.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize
    # Floor division keeps binsize an int under both Python 2 and 3; it is
    # used below as a slice width and range step.
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(chunk) for chunk in bins])

    # indices of the bins that overlap the requested [start, end] window
    ind = util.find(lambda chunk: chunk[-1] >= start and chunk[0] <= end,
                    bins)
    obs = util.mget(obs, ind)

    # The bin edges do not depend on the parameters being optimized, so
    # they are computed once instead of on every optimizer evaluation.
    x = [chunk[0] for chunk in bins]
    total = len(data)

    def optfunc(params):
        """Chi-square statistic of the binned data for `params`."""
        expected = [total * cdf(x[1], params)]
        expected.extend(total * (cdf(x[i + 1], params) - cdf(x[i], params))
                        for i in range(1, len(x) - 1))
        expected.append(total * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is parsed as tab-delimited text.  The first row is a header;
    its columns from the third onward are returned as `species`.  In each
    following row, column 0 is returned in `files`, column 1 is converted
    to int and returned in `gene_sizes`, and the remaining columns are
    converted to float and returned as one row of `lens`.

    Arguments:
    filename   -- path of the matrix file
    minlen     -- every length below this value is raised to it
    maxlen     -- accepted for interface compatibility; not used here
    nooutliers -- if true, drop every row whose summed length is not
                  below 5 times the mean summed length over all rows

    Returns the tuple (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # Read inside a context manager so the file handle is closed promptly
    # rather than left to the garbage collector.
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(
        float,
        util.submatrix(dat, range(1, len(dat)), range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        # The tuple is parenthesized so the comprehension is valid on both
        # Python 2 and Python 3.
        files, gene_sizes, lens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens)]

    # clamp very small lengths up to minlen, in place
    for row in lens:
        for i, value in enumerate(row):
            if value < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is parsed as tab-delimited text.  The first row is a header;
    its columns from the third onward are returned as `species`.  In each
    following row, column 0 is returned in `files`, column 1 is converted
    to int and returned in `gene_sizes`, and the remaining columns are
    converted to float and returned as one row of `lens`.

    Arguments:
    filename   -- path of the matrix file
    minlen     -- every length below this value is raised to it
    maxlen     -- accepted for interface compatibility; not used here
    nooutliers -- if true, drop every row whose summed length is not
                  below 5 times the mean summed length over all rows

    Returns the tuple (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # Read inside a context manager so the file handle is closed promptly
    # rather than left to the garbage collector.
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    lens = util.map2(
        float,
        util.submatrix(dat, range(1, len(dat)), range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(dat[1:], 1)]
    files = util.cget(dat[1:], 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        # The tuple is parenthesized so the comprehension is valid on both
        # Python 2 and Python 3.
        files, gene_sizes, lens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens)]

    # clamp very small lengths up to minlen, in place
    for row in lens:
        for i, value in enumerate(row):
            if value < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def fit_distrib(cdf, params_init, data, ndivs=20, minsamples=5,
                start=-util.INF, end=util.INF):
    """Fit the parameters of a CDF to `data` by minimizing chi-square.

    The sorted data is cut into consecutive bins of `binsize` samples each
    (the last bin may be shorter).  `scipy.optimize.fmin` then searches
    for the parameters minimizing the chi-square statistic between the
    observed bin counts and the counts expected under `cdf`, using only
    the bins that overlap [start, end].

    Arguments:
    cdf         -- callable `cdf(x, params)` returning the cumulative
                   probability at x
    params_init -- starting point for the optimizer
    data        -- sequence of samples
    ndivs       -- requested number of bins
    minsamples  -- if the requested binning gives fewer samples per bin
                   than this, the number of bins is reduced accordingly
    start, end  -- only bins overlapping [start, end] drive the fit

    Returns (params, pval): the fitted parameters as a list and the
    p-value reported by chi_square_fit() for those parameters.

    NOTE(review): the final chi_square_fit() call does not receive
    start/end, so the returned p-value is computed over all bins --
    confirm this is intended.
    """
    import scipy
    import scipy.optimize
    import scipy.stats

    # determine ndiv and binsize
    # Must be floor division: with true division binsize becomes a float,
    # and both the range() step and the slice bound below raise TypeError.
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(chunk) for chunk in bins])

    # indices of the bins that overlap the requested [start, end] window
    ind = util.find(lambda chunk: chunk[-1] >= start and chunk[0] <= end,
                    bins)
    obs = util.mget(obs, ind)

    # The bin edges do not depend on the parameters being optimized, so
    # they are computed once instead of on every optimizer evaluation.
    x = [chunk[0] for chunk in bins]
    total = len(data)

    def optfunc(params):
        """Chi-square statistic of the binned data for `params`."""
        expected = [total * cdf(x[1], params)]
        expected.extend(total * (cdf(x[i + 1], params) - cdf(x[i], params))
                        for i in range(1, len(x) - 1))
        expected.append(total * (1.0 - cdf(x[-1], params)))
        expected = scipy.array(util.mget(expected, ind))

        chi2, pval = scipy.stats.chisquare(obs, expected)
        return chi2

    params = scipy.optimize.fmin(optfunc, params_init, disp=False)
    chi2, pval = chi_square_fit(cdf, params, data, ndivs, minsamples)

    return list(params), pval
def chi_square_fit(cdf, params, data, ndivs=20, minsamples=5, plot=False,
                   start=-util.INF, end=util.INF):
    """Chi-square goodness-of-fit test of `data` against a parametric CDF.

    The sorted data is cut into consecutive bins of `binsize` samples each
    (the last bin may be shorter).  Expected counts per bin are derived
    from `cdf` evaluated at the first sample of each bin, and compared to
    the observed counts with `scipy.stats.chisquare`.

    Arguments:
    cdf        -- callable `cdf(x, params)` returning the cumulative
                  probability at x
    params     -- parameters passed through to `cdf`
    data       -- sequence of samples
    ndivs      -- requested number of bins
    minsamples -- if the requested binning gives fewer samples per bin
                  than this, the number of bins is reduced accordingly
    plot       -- if true, plot observed and expected counts with gnuplot
    start, end -- only bins overlapping [start, end] are tested

    Returns the tuple (chi2, pval) produced by `scipy.stats.chisquare`.

    At least two bins are required (the first expected count reads the
    second bin edge).  Raises ZeroDivisionError when rebinning is needed
    and len(data) < minsamples.
    """
    from rasmus import gnuplot
    import scipy
    import scipy.stats

    # determine ndiv and binsize
    # Must be floor division: with true division binsize becomes a float,
    # and both the range() step and the slice bound below raise TypeError.
    binsize = len(data) // ndivs
    if binsize < minsamples:
        ndivs = len(data) // minsamples
        binsize = len(data) // ndivs

    data = sorted(data)
    bins = [data[i:i + binsize] for i in range(0, len(data), binsize)]
    obs = scipy.array([len(chunk) for chunk in bins])

    # indices of the bins that overlap the requested [start, end] window
    ind = util.find(lambda chunk: chunk[-1] >= start and chunk[0] <= end,
                    bins)
    obs = util.mget(obs, ind)

    # first (smallest) sample of every bin, used as the bin's lower edge
    x = [chunk[0] for chunk in bins]
    total = len(data)

    # first bin: everything below the second edge; last bin: everything
    # above the last edge; interior bins: CDF difference between edges
    expected = [total * cdf(x[1], params)]
    expected.extend(total * (cdf(x[i + 1], params) - cdf(x[i], params))
                    for i in range(1, len(x) - 1))
    expected.append(total * (1.0 - cdf(x[-1], params)))
    expected = scipy.array(util.mget(expected, ind))

    chi2, pval = scipy.stats.chisquare(obs, expected)

    if plot:
        p = gnuplot.plot(util.mget(x, ind), obs)
        p.plot(util.mget(x, ind), expected)

    return chi2, pval
def remove_gapped_columns(aln):
    """Build an alignment from the columns of 'aln' that contain no gap.

    The sequences in 'aln' are transposed into columns; a column is kept
    only when none of its characters is "-".  The indices of the kept
    columns are handed to subalign(), whose result is returned.
    """
    def is_ungapped(column):
        return "-" not in column

    columns = zip(*aln.values())
    keep = util.find(is_ungapped, columns)
    return subalign(aln, keep)
def remove_gapped_columns(aln):
    """Build an alignment from the columns of 'aln' that contain no gap.

    The sequences in 'aln' are transposed into columns; a column is kept
    only when none of its characters is "-".  The indices of the kept
    columns are handed to subalign(), whose result is returned.
    """
    def is_ungapped(column):
        return "-" not in column

    columns = zip(*aln.values())
    keep = util.find(is_ungapped, columns)
    return subalign(aln, keep)
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned values against a model function.

    Bin i spans xbins[i] .. xbins[i + 1].  Its observed count is
    ybins[i] * width * nsamples, and its expected count is the average of
    `func` at the two bin edges times width * nsamples.  Only bins
    selected by util.gefunc(minsamples) on the expected count are tested
    (presumably those with expected >= minsamples -- verify against util).

    Arguments:
    xbins      -- sequence of bin positions
    ybins      -- per-bin values, scaled by width and nsamples into counts
    func       -- model function of one argument, evaluated at bin edges
    nsamples   -- number of samples the binned values were built from
    nparams    -- passed through to chiSquare()
    minsamples -- threshold applied to the expected counts

    Returns (result, counts, expected) where `counts` and `expected` are
    the values of the tested bins.  `result` is what chiSquare() returns,
    or [0, 1] when there is no bin to test.
    """
    nbins = len(xbins) - 1
    if nbins < 1:
        # Fewer than two bin positions means there is no bin at all;
        # report the same "nothing to test" result as the empty case below
        # instead of failing with an IndexError.
        return [0, 1], [], []

    sizes = [xbins[i + 1] - xbins[i] for i in range(nbins)]

    counts = [ybins[i] * sizes[i] * nsamples for i in range(nbins)]
    expected = [
        (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples
        for i in range(nbins)
    ]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    return chiSquare([counts], [expected], nparams), counts, expected
def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5):
    """Chi-square test of binned values against a model function.

    Bin i spans xbins[i] .. xbins[i + 1].  Its observed count is
    ybins[i] * width * nsamples, and its expected count is the average of
    `func` at the two bin edges times width * nsamples.  Only bins
    selected by util.gefunc(minsamples) on the expected count are tested
    (presumably those with expected >= minsamples -- verify against util).

    Arguments:
    xbins      -- sequence of bin positions
    ybins      -- per-bin values, scaled by width and nsamples into counts
    func       -- model function of one argument, evaluated at bin edges
    nsamples   -- number of samples the binned values were built from
    nparams    -- passed through to chiSquare()
    minsamples -- threshold applied to the expected counts

    Returns (result, counts, expected) where `counts` and `expected` are
    the values of the tested bins.  `result` is what chiSquare() returns,
    or [0, 1] when there is no bin to test.
    """
    nbins = len(xbins) - 1
    if nbins < 1:
        # Fewer than two bin positions means there is no bin at all;
        # report the same "nothing to test" result as the empty case below
        # instead of failing with an IndexError.
        return [0, 1], [], []

    sizes = [xbins[i + 1] - xbins[i] for i in range(nbins)]

    counts = [ybins[i] * sizes[i] * nsamples for i in range(nbins)]
    expected = [
        (func(xbins[i]) + func(xbins[i + 1])) / 2.0 * sizes[i] * nsamples
        for i in range(nbins)
    ]

    # ensure we have enough expected samples in each bin
    ind = util.find(util.gefunc(minsamples), expected)
    counts = util.mget(counts, ind)
    expected = util.mget(expected, ind)

    if len(counts) == 0:
        return [0, 1], counts, expected
    return chiSquare([counts], [expected], nparams), counts, expected